{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2/runs/Sep04_03-44-32_gx12", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 40, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 20, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": { "task_type": "CAUSAL_LM", "peft_type": "LORA", "auto_mapping": null, "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", "revision": null, "inference_mode": false, "r": 16, "target_modules": [ "o_proj", "down_proj", "v_proj", "up_proj", "q_proj", "gate_proj", "k_proj" ], "exclude_modules": null, "lora_alpha": 16, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", "use_rslora": true, "modules_to_save": null, "init_lora_weights": true, "layers_to_transform": null, "layers_pattern": null, "rank_pattern": {}, "alpha_pattern": {}, "megatron_config": null, "megatron_core": "megatron.core", "trainable_token_indices": null, "loftq_config": {}, "eva_config": null, "corda_config": null, "use_dora": false, "layer_replication": null, "runtime_config": { "ephemeral_gpu_offload": false }, "lora_bias": false }, "flops": { "eval": 224996302651284480, "train": 5.201887604744794e+16, "total": 2.770151786987324e+17 }, "total_energy": 80.49119, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:44:54.030340", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 513.7015351300394, "timestamp": "2025-09-04 03:44:54.032582", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:44:54.153228", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.22628124058246613, "timestamp": "2025-09-04 03:44:54.166141", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:44:54.277663", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.18759363889694214, "timestamp": "2025-09-04 03:44:54.296326", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:44:54.403899", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.21276673674583435, "timestamp": "2025-09-04 03:44:54.422364", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:44:54.529920", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.20896945893764496, "timestamp": "2025-09-04 03:44:54.584227", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:44:54.682359", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.15049788355827332, "timestamp": "2025-09-04 03:44:54.701147", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:44:54.795660", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.221171572804451, "timestamp": "2025-09-04 03:44:54.812689", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:44:54.925572", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.13799867033958435, "timestamp": "2025-09-04 03:44:54.945627", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:44:55.048805", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.254111111164093, "timestamp": "2025-09-04 03:44:55.068403", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:44:55.176185", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.2516261041164398, "timestamp": "2025-09-04 03:44:55.197753", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:44:55.292267", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.11826080828905106, "timestamp": "2025-09-04 03:44:55.309480", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:44:55.392164", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.3149326741695404, "timestamp": "2025-09-04 03:44:55.404837", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:44:55.501241", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.1865367889404297, "timestamp": "2025-09-04 03:44:55.519423", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:44:55.618360", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.22903558611869812, "timestamp": "2025-09-04 03:44:55.638547", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:44:55.727492", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.1957457810640335, "timestamp": "2025-09-04 03:44:55.743023", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:44:55.850708", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.26458948850631714, "timestamp": "2025-09-04 03:44:55.870711", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:44:55.956573", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.19251839816570282, "timestamp": "2025-09-04 03:44:55.972768", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:44:56.065326", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.169407457113266, "timestamp": "2025-09-04 03:44:56.084337", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:44:56.164607", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.1738697588443756, "timestamp": "2025-09-04 03:44:56.178386", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:44:56.250786", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.12069570273160934, "timestamp": "2025-09-04 03:44:56.263511", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:44:56.374549", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.28613629937171936, "timestamp": "2025-09-04 03:44:56.395709", "step": 20, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:04.776963", "step": 20, "epoch": 1 }, { "type": "pplx", "content": 459.69697261034077, "timestamp": "2025-09-04 03:45:04.779321", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:04.883567", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.0930766835808754, "timestamp": "2025-09-04 03:45:04.905592", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1392 ], "flops": 27840169073088.0 }, "timestamp": "2025-09-04 03:45:05.111656", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.16422075033187866, "timestamp": "2025-09-04 03:45:05.150856", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:45:05.262285", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.15494759380817413, "timestamp": "2025-09-04 03:45:05.282894", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 03:45:05.496961", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.1941477507352829, "timestamp": "2025-09-04 03:45:05.538559", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:05.640198", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.046016380190849304, "timestamp": "2025-09-04 03:45:05.661299", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:05.770082", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.26754552125930786, "timestamp": "2025-09-04 03:45:05.789791", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:05.887422", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.2157849818468094, "timestamp": "2025-09-04 03:45:05.903902", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:06.003259", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.09500601142644882, "timestamp": "2025-09-04 03:45:06.022367", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:06.127436", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.2806503176689148, "timestamp": "2025-09-04 03:45:06.149677", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:45:06.261111", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.147545725107193, "timestamp": "2025-09-04 03:45:06.281302", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:06.383742", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.0946376770734787, "timestamp": "2025-09-04 03:45:06.402770", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:06.498965", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.07105350494384766, "timestamp": "2025-09-04 03:45:06.516556", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:45:06.595117", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.05865024775266647, "timestamp": "2025-09-04 03:45:06.609671", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:06.700887", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.04186881706118584, "timestamp": "2025-09-04 03:45:06.717379", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:06.820494", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.1130460873246193, "timestamp": "2025-09-04 03:45:06.839402", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:06.935561", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.08032827824354172, "timestamp": "2025-09-04 03:45:06.953667", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:07.060582", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.10274317115545273, "timestamp": "2025-09-04 03:45:07.082672", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:07.192812", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.05926857143640518, "timestamp": "2025-09-04 03:45:07.212899", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:45:07.301385", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.09572148323059082, "timestamp": "2025-09-04 03:45:07.316491", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:45:07.394036", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.06614067405462265, "timestamp": "2025-09-04 03:45:07.408562", "step": 40, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:15.977735", "step": 40, "epoch": 1 }, { "type": "pplx", "content": 389.98537419869405, "timestamp": "2025-09-04 03:45:15.980248", "step": 40, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 40", "timestamp": "2025-09-04 03:45:16.522945", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:45:16.607576", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.0893501564860344, "timestamp": "2025-09-04 03:45:16.624424", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:45:16.725320", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.10664358735084534, "timestamp": "2025-09-04 03:45:16.743948", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:16.850040", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.11798786371946335, "timestamp": "2025-09-04 03:45:16.869875", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:45:16.959614", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.09886734187602997, "timestamp": "2025-09-04 03:45:16.975202", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:45:17.049375", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.040978167206048965, "timestamp": "2025-09-04 03:45:17.064184", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:17.173784", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.06903672963380814, "timestamp": "2025-09-04 03:45:17.193812", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:17.296985", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.02300487831234932, "timestamp": "2025-09-04 03:45:17.316264", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:17.408720", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.0908779725432396, "timestamp": "2025-09-04 03:45:17.426033", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:17.523711", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.08729575574398041, "timestamp": "2025-09-04 03:45:17.543919", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:17.637324", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.0870480164885521, "timestamp": "2025-09-04 03:45:17.654175", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:17.756185", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.050713177770376205, "timestamp": "2025-09-04 03:45:17.775418", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:45:17.857774", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.04926443472504616, "timestamp": "2025-09-04 03:45:17.873580", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:17.976932", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.014587458223104477, "timestamp": "2025-09-04 03:45:17.998752", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:18.096464", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.0563230998814106, "timestamp": "2025-09-04 03:45:18.113944", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:18.204053", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.052430395036935806, "timestamp": "2025-09-04 03:45:18.220905", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:45:18.299562", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.08462968468666077, "timestamp": "2025-09-04 03:45:18.314267", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:45:18.410745", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.06088142469525337, "timestamp": "2025-09-04 03:45:18.431360", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:18.531885", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.10583172738552094, "timestamp": "2025-09-04 03:45:18.550360", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:18.645711", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.05197317525744438, "timestamp": "2025-09-04 03:45:18.663022", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:18.767990", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.0938020795583725, "timestamp": "2025-09-04 03:45:18.788566", "step": 60, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:27.161836", "step": 60, "epoch": 1 }, { "type": "pplx", "content": 344.9719250300418, "timestamp": "2025-09-04 03:45:27.164144", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:27.256830", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.047754935920238495, "timestamp": "2025-09-04 03:45:27.275797", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:45:27.384552", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.08348787575960159, "timestamp": "2025-09-04 03:45:27.404855", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:27.496674", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.048718009144067764, "timestamp": "2025-09-04 03:45:27.513341", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 03:45:27.732444", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.06368561834096909, "timestamp": "2025-09-04 03:45:27.775457", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:45:27.866602", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.047636643052101135, "timestamp": "2025-09-04 03:45:27.885576", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:27.994542", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.09618725627660751, "timestamp": "2025-09-04 03:45:28.014601", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 03:45:28.146160", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.09339220821857452, "timestamp": "2025-09-04 03:45:28.169472", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:28.276347", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.05055764317512512, "timestamp": "2025-09-04 03:45:28.296999", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:45:28.379869", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.06613060086965561, "timestamp": "2025-09-04 03:45:28.396302", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:28.498772", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.08002032339572906, "timestamp": "2025-09-04 03:45:28.517928", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:45:28.595425", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.09714193642139435, "timestamp": "2025-09-04 03:45:28.609410", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:28.716648", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.048158854246139526, "timestamp": "2025-09-04 03:45:28.737622", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:28.833604", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.17001864314079285, "timestamp": "2025-09-04 03:45:28.854016", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:45:28.963395", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.025268767029047012, "timestamp": "2025-09-04 03:45:28.983767", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:29.089863", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.025647073984146118, "timestamp": "2025-09-04 03:45:29.109647", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:45:29.193050", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.06310443580150604, "timestamp": "2025-09-04 03:45:29.208797", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:45:29.289171", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.006233109161257744, "timestamp": "2025-09-04 03:45:29.305528", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:29.403738", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.03743808716535568, "timestamp": "2025-09-04 03:45:29.422112", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:45:29.532220", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.06183940917253494, "timestamp": "2025-09-04 03:45:29.552634", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:29.658591", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.03081091120839119, "timestamp": "2025-09-04 03:45:29.679007", "step": 80, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:38.054098", "step": 80, "epoch": 1 }, { "type": "pplx", "content": 318.24303809489555, "timestamp": "2025-09-04 03:45:38.056443", "step": 80, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 80", "timestamp": "2025-09-04 03:45:38.556351", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:38.660607", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.0675966814160347, "timestamp": "2025-09-04 03:45:38.682606", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:38.778116", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.07781155407428741, "timestamp": "2025-09-04 03:45:38.795394", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:38.899758", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.06105361878871918, "timestamp": "2025-09-04 03:45:38.918706", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:39.026173", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.049994468688964844, "timestamp": "2025-09-04 03:45:39.046706", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:39.138861", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.1184161901473999, "timestamp": "2025-09-04 03:45:39.157849", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:45:39.243936", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.07634230703115463, "timestamp": "2025-09-04 03:45:39.259348", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:39.361762", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.03923124074935913, "timestamp": "2025-09-04 03:45:39.380754", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:45:39.458872", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.0754866749048233, "timestamp": "2025-09-04 03:45:39.473629", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:45:39.542590", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.049534834921360016, "timestamp": "2025-09-04 03:45:39.556461", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:45:39.634576", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.09451133012771606, "timestamp": "2025-09-04 03:45:39.648463", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:39.750807", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.1034846305847168, "timestamp": "2025-09-04 03:45:39.770033", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:45:39.855459", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.060452286154031754, "timestamp": "2025-09-04 03:45:39.871436", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:39.961786", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.05689983442425728, "timestamp": "2025-09-04 03:45:39.980357", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:45:40.109208", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.009844101965427399, "timestamp": "2025-09-04 03:45:40.132173", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:45:40.214872", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.08469025045633316, "timestamp": "2025-09-04 03:45:40.229769", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:40.328739", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.10152354091405869, "timestamp": "2025-09-04 03:45:40.347831", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:45:40.445260", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.0582207627594471, "timestamp": "2025-09-04 03:45:40.465784", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:45:40.580598", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.05192510411143303, "timestamp": "2025-09-04 03:45:40.600367", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:40.700989", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.09743145853281021, "timestamp": "2025-09-04 03:45:40.719330", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:40.815781", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.07235053181648254, "timestamp": "2025-09-04 03:45:40.833849", "step": 100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:49.188649", "step": 100, "epoch": 1 }, { "type": "pplx", "content": 307.3854100186044, "timestamp": "2025-09-04 03:45:49.190883", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:45:49.324428", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.04918394982814789, "timestamp": "2025-09-04 03:45:49.353243", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:45:49.444040", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.12214594334363937, "timestamp": "2025-09-04 03:45:49.460765", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:45:49.561386", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.044912759214639664, "timestamp": "2025-09-04 03:45:49.579949", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:49.681534", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.14099526405334473, "timestamp": "2025-09-04 03:45:49.701325", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:45:49.783498", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.0638093501329422, "timestamp": "2025-09-04 03:45:49.800247", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:45:49.904552", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.013638557866215706, "timestamp": "2025-09-04 03:45:49.923592", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:45:50.006927", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.07801222056150436, "timestamp": "2025-09-04 03:45:50.022042", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:45:50.144157", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.06590745598077774, "timestamp": "2025-09-04 03:45:50.166685", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:50.262360", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.11835378408432007, "timestamp": "2025-09-04 03:45:50.282746", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:50.376627", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.12223128229379654, "timestamp": "2025-09-04 03:45:50.393565", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:45:50.504278", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.1280670166015625, "timestamp": "2025-09-04 03:45:50.524729", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:45:50.623814", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.06618773937225342, "timestamp": "2025-09-04 03:45:50.642958", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:50.734068", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.04645576328039169, "timestamp": "2025-09-04 03:45:50.752647", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:45:50.829181", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.13508065044879913, "timestamp": "2025-09-04 03:45:50.842752", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:45:50.934855", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.0594618059694767, "timestamp": "2025-09-04 03:45:50.951746", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:45:51.053664", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.11379887908697128, "timestamp": "2025-09-04 03:45:51.073378", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:45:51.164518", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.039490919560194016, "timestamp": "2025-09-04 03:45:51.183413", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:45:51.291693", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.04585837572813034, "timestamp": "2025-09-04 03:45:51.311758", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:45:51.407070", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.07603704929351807, "timestamp": "2025-09-04 03:45:51.424283", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:45:51.540887", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.016811968758702278, "timestamp": "2025-09-04 03:45:51.563695", "step": 120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:45:59.943776", "step": 120, "epoch": 1 }, { "type": "pplx", "content": 299.89610834083794, "timestamp": "2025-09-04 03:45:59.945489", "step": 120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 120", "timestamp": "2025-09-04 03:46:00.290401", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 03:46:00.476561", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.08362013101577759, "timestamp": "2025-09-04 03:46:00.514696", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:46:00.596482", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.059237729758024216, "timestamp": "2025-09-04 03:46:00.611683", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:46:00.727922", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.039004936814308167, "timestamp": "2025-09-04 03:46:00.749930", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:46:00.824487", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.11705737560987473, "timestamp": "2025-09-04 03:46:00.838610", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:00.914119", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.06630095839500427, "timestamp": "2025-09-04 03:46:00.929469", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:46:01.019324", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.04516802728176117, "timestamp": "2025-09-04 03:46:01.035861", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:01.144582", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.04330586642026901, "timestamp": "2025-09-04 03:46:01.165051", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:46:01.259475", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.061153244227170944, "timestamp": "2025-09-04 03:46:01.277402", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:01.374266", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.11837725341320038, "timestamp": "2025-09-04 03:46:01.394579", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:46:01.476506", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.02547391690313816, "timestamp": "2025-09-04 03:46:01.491380", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:01.598388", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.03671078011393547, "timestamp": "2025-09-04 03:46:01.618545", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:46:01.711084", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.15452733635902405, "timestamp": "2025-09-04 03:46:01.728810", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:01.804783", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.07617174834012985, "timestamp": "2025-09-04 03:46:01.820045", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:01.920338", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.062410350888967514, "timestamp": "2025-09-04 03:46:01.938924", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:46:02.054545", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.0330992229282856, "timestamp": "2025-09-04 03:46:02.076694", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:46:02.186634", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.029934488236904144, "timestamp": "2025-09-04 03:46:02.207952", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:02.306911", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.048758380115032196, "timestamp": "2025-09-04 03:46:02.327997", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:02.424443", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.11269187927246094, "timestamp": "2025-09-04 03:46:02.441749", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:02.543693", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.0775558203458786, "timestamp": "2025-09-04 03:46:02.562600", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:02.658416", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.05858410522341728, "timestamp": "2025-09-04 03:46:02.676400", "step": 140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:46:11.045850", "step": 140, "epoch": 1 }, { "type": "pplx", "content": 295.5688961078636, "timestamp": "2025-09-04 03:46:11.048049", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:11.145509", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.034822121262550354, "timestamp": "2025-09-04 03:46:11.166258", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:11.275463", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.025513645261526108, "timestamp": "2025-09-04 03:46:11.295707", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:11.405349", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.022908439859747887, "timestamp": "2025-09-04 03:46:11.425606", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:11.531596", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.05994102731347084, "timestamp": "2025-09-04 03:46:11.552142", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 03:46:11.680734", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.06833264976739883, "timestamp": "2025-09-04 03:46:11.707754", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:11.809626", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.07911453396081924, "timestamp": "2025-09-04 03:46:11.828239", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:11.934765", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.08064809441566467, "timestamp": "2025-09-04 03:46:11.954473", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:12.062843", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.03310004621744156, "timestamp": "2025-09-04 03:46:12.083696", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:46:12.166474", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.03188398852944374, "timestamp": "2025-09-04 03:46:12.183268", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:12.293783", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.03804198279976845, "timestamp": "2025-09-04 03:46:12.314141", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:12.418609", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.03391795977950096, "timestamp": "2025-09-04 03:46:12.437588", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:12.515641", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.024908579885959625, "timestamp": "2025-09-04 03:46:12.530339", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:46:12.621956", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.0160362645983696, "timestamp": "2025-09-04 03:46:12.640839", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:12.741410", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.030184214934706688, "timestamp": "2025-09-04 03:46:12.760113", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:12.839419", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.03809965401887894, "timestamp": "2025-09-04 03:46:12.853334", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:12.953947", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.03310762345790863, "timestamp": "2025-09-04 03:46:12.973351", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:13.079957", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.05042213574051857, "timestamp": "2025-09-04 03:46:13.102326", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:46:13.223808", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.03846399486064911, "timestamp": "2025-09-04 03:46:13.245799", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:13.354628", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.030467255041003227, "timestamp": "2025-09-04 03:46:13.374922", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:13.482931", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.07905561476945877, "timestamp": "2025-09-04 03:46:13.503834", "step": 160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:46:21.876993", "step": 160, "epoch": 1 }, { "type": "pplx", "content": 299.0262854255139, "timestamp": "2025-09-04 03:46:21.879135", "step": 160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 160", "timestamp": "2025-09-04 03:46:22.373385", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:22.469238", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.10933341830968857, "timestamp": "2025-09-04 03:46:22.489584", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:46:22.572640", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.029027054086327553, "timestamp": "2025-09-04 03:46:22.587595", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:22.688492", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.012869374826550484, "timestamp": "2025-09-04 03:46:22.707182", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:46:22.789587", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.03408244624733925, "timestamp": "2025-09-04 03:46:22.805117", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:22.897367", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.024805987253785133, "timestamp": "2025-09-04 03:46:22.916271", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:46:23.006515", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.02760004997253418, "timestamp": "2025-09-04 03:46:23.023015", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:46:23.115490", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.06649627536535263, "timestamp": "2025-09-04 03:46:23.132373", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:23.236020", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.08248773962259293, "timestamp": "2025-09-04 03:46:23.255843", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:23.364354", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.030205007642507553, "timestamp": "2025-09-04 03:46:23.386069", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:23.480780", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.07514587044715881, "timestamp": "2025-09-04 03:46:23.498073", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:46:23.576363", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.07391712814569473, "timestamp": "2025-09-04 03:46:23.589072", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:23.689317", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.04588964581489563, "timestamp": "2025-09-04 03:46:23.708742", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:23.787355", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.01995799131691456, "timestamp": "2025-09-04 03:46:23.802655", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:23.914759", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.04095141217112541, "timestamp": "2025-09-04 03:46:23.934809", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:24.040047", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.07867230474948883, "timestamp": "2025-09-04 03:46:24.060041", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:46:24.144862", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.08477169275283813, "timestamp": "2025-09-04 03:46:24.161034", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:24.258703", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.048271287232637405, "timestamp": "2025-09-04 03:46:24.279194", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:24.380449", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.056753601878881454, "timestamp": "2025-09-04 03:46:24.399137", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:46:24.481124", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.0256480872631073, "timestamp": "2025-09-04 03:46:24.496079", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:24.605160", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.00964992307126522, "timestamp": "2025-09-04 03:46:24.626113", "step": 180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:46:32.989309", "step": 180, "epoch": 1 }, { "type": "pplx", "content": 304.3604161862801, "timestamp": "2025-09-04 03:46:32.990886", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:33.063611", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.08969972282648087, "timestamp": "2025-09-04 03:46:33.078807", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:33.186793", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.029566926881670952, "timestamp": "2025-09-04 03:46:33.207034", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:33.309505", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.016738848760724068, "timestamp": "2025-09-04 03:46:33.328509", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:33.404480", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.030050721019506454, "timestamp": "2025-09-04 03:46:33.419146", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:46:33.538599", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.032734520733356476, "timestamp": "2025-09-04 03:46:33.562256", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:33.660578", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.03777821734547615, "timestamp": "2025-09-04 03:46:33.678971", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:33.785505", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.07701154798269272, "timestamp": "2025-09-04 03:46:33.805660", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:33.911868", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.040181923657655716, "timestamp": "2025-09-04 03:46:33.932359", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:34.013114", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.10895628482103348, "timestamp": "2025-09-04 03:46:34.028150", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:46:34.111556", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.03597872704267502, "timestamp": "2025-09-04 03:46:34.126652", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:46:34.223903", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.020103169605135918, "timestamp": "2025-09-04 03:46:34.241188", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:34.350326", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.04040881618857384, "timestamp": "2025-09-04 03:46:34.371538", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:46:34.478562", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.05105682834982872, "timestamp": "2025-09-04 03:46:34.501219", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:34.606829", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.013259065337479115, "timestamp": "2025-09-04 03:46:34.626713", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:34.721624", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.05866575241088867, "timestamp": "2025-09-04 03:46:34.738863", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:34.815582", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.0314386822283268, "timestamp": "2025-09-04 03:46:34.830274", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:34.928107", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.08559767156839371, "timestamp": "2025-09-04 03:46:34.948472", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:35.041821", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.051514316350221634, "timestamp": "2025-09-04 03:46:35.059202", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:46:35.153126", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.021353455260396004, "timestamp": "2025-09-04 03:46:35.170412", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:46:35.280731", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.006292775738984346, "timestamp": "2025-09-04 03:46:35.301953", "step": 200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:46:43.677848", "step": 200, "epoch": 1 }, { "type": "pplx", "content": 308.7893902698641, "timestamp": "2025-09-04 03:46:43.680096", "step": 200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 200", "timestamp": "2025-09-04 03:46:44.033775", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:46:44.114791", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.13709776103496552, "timestamp": "2025-09-04 03:46:44.131611", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:44.234364", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.06784407794475555, "timestamp": "2025-09-04 03:46:44.253436", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:46:44.346048", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.023510560393333435, "timestamp": "2025-09-04 03:46:44.363111", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:44.440942", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.1330084502696991, "timestamp": "2025-09-04 03:46:44.455646", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:44.555190", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.02427174523472786, "timestamp": "2025-09-04 03:46:44.576158", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:44.654034", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.08518130332231522, "timestamp": "2025-09-04 03:46:44.667993", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:46:44.745924", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.028402844443917274, "timestamp": "2025-09-04 03:46:44.759722", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:44.862696", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.022264117375016212, "timestamp": "2025-09-04 03:46:44.882252", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:44.989666", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.014760280027985573, "timestamp": "2025-09-04 03:46:45.011616", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:45.120759", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.02477847971022129, "timestamp": "2025-09-04 03:46:45.140323", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:45.245385", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.09371557086706161, "timestamp": "2025-09-04 03:46:45.264286", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:45.368282", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.040149930864572525, "timestamp": "2025-09-04 03:46:45.387896", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:45.487708", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.05073068290948868, "timestamp": "2025-09-04 03:46:45.508705", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 03:46:45.724051", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.03666497766971588, "timestamp": "2025-09-04 03:46:45.762808", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:45.841828", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.025966059416532516, "timestamp": "2025-09-04 03:46:45.855469", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:45.957760", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.04828720539808273, "timestamp": "2025-09-04 03:46:45.976941", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:46.074755", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.008038034662604332, "timestamp": "2025-09-04 03:46:46.094767", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:46.198305", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.02476082369685173, "timestamp": "2025-09-04 03:46:46.217094", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:46.317114", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.03774585947394371, "timestamp": "2025-09-04 03:46:46.335244", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:46:46.435893", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.04572395235300064, "timestamp": "2025-09-04 03:46:46.454869", "step": 220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:46:54.891309", "step": 220, "epoch": 1 }, { "type": "pplx", "content": 311.0440274571144, "timestamp": "2025-09-04 03:46:54.893262", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:54.966356", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.020546726882457733, "timestamp": "2025-09-04 03:46:54.981421", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:55.059442", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.047001611441373825, "timestamp": "2025-09-04 03:46:55.073274", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:46:55.182713", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.04990854486823082, "timestamp": "2025-09-04 03:46:55.203243", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:46:55.297066", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.03975335881114006, "timestamp": "2025-09-04 03:46:55.314614", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:55.412711", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.037995487451553345, "timestamp": "2025-09-04 03:46:55.433055", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:55.535515", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.06138302758336067, "timestamp": "2025-09-04 03:46:55.554130", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 03:46:55.774170", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.04723235219717026, "timestamp": "2025-09-04 03:46:55.816481", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:46:55.920763", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.013455665670335293, "timestamp": "2025-09-04 03:46:55.940359", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:46:56.016328", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.05484043434262276, "timestamp": "2025-09-04 03:46:56.031474", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1200 ], "flops": 24000145761984.0 }, "timestamp": "2025-09-04 03:46:56.210233", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.011813382618129253, "timestamp": "2025-09-04 03:46:56.242851", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:56.338311", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.04934141784906387, "timestamp": "2025-09-04 03:46:56.355328", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:46:56.462670", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.01455477625131607, "timestamp": "2025-09-04 03:46:56.483049", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:46:56.589711", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.035196904093027115, "timestamp": "2025-09-04 03:46:56.611601", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:56.713067", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.1223868653178215, "timestamp": "2025-09-04 03:46:56.731468", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:56.836088", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.023488562554121017, "timestamp": "2025-09-04 03:46:56.855251", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:46:57.059604", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.041184525936841965, "timestamp": "2025-09-04 03:46:57.099406", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:46:57.192703", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.04467172920703888, "timestamp": "2025-09-04 03:46:57.211674", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:46:57.296253", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.058227408677339554, "timestamp": "2025-09-04 03:46:57.311130", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:46:57.414555", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.024533184245228767, "timestamp": "2025-09-04 03:46:57.433467", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:46:57.533042", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.03975701332092285, "timestamp": "2025-09-04 03:46:57.552325", "step": 240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:47:05.981740", "step": 240, "epoch": 1 }, { "type": "pplx", "content": 312.9286034141921, "timestamp": "2025-09-04 03:47:05.983838", "step": 240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 240", "timestamp": "2025-09-04 03:47:06.491752", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:06.593414", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.022641537711024284, "timestamp": "2025-09-04 03:47:06.614328", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:47:06.718158", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.026233583688735962, "timestamp": "2025-09-04 03:47:06.737180", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:47:06.859887", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.030290089547634125, "timestamp": "2025-09-04 03:47:06.882829", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:06.989864", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.009676921181380749, "timestamp": "2025-09-04 03:47:07.010467", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:47:07.118651", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.02796107716858387, "timestamp": "2025-09-04 03:47:07.141141", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:07.251743", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.06316182762384415, "timestamp": "2025-09-04 03:47:07.271865", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:47:07.349164", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.053157925605773926, "timestamp": "2025-09-04 03:47:07.362748", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:47:07.501150", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.1000499501824379, "timestamp": "2025-09-04 03:47:07.527516", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:47:07.603960", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.07537966966629028, "timestamp": "2025-09-04 03:47:07.618912", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:07.723085", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.04253721982240677, "timestamp": "2025-09-04 03:47:07.742175", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:07.843434", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.026042405515909195, "timestamp": "2025-09-04 03:47:07.862102", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:07.971729", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.0640282928943634, "timestamp": "2025-09-04 03:47:07.992076", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:08.094410", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.013043270446360111, "timestamp": "2025-09-04 03:47:08.115388", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:08.219527", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.02681521698832512, "timestamp": "2025-09-04 03:47:08.238620", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 03:47:08.416005", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.02740609645843506, "timestamp": "2025-09-04 03:47:08.448567", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:47:08.536920", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.06523073464632034, "timestamp": "2025-09-04 03:47:08.553113", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:08.645862", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.019541790708899498, "timestamp": "2025-09-04 03:47:08.664680", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:08.758789", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.02935168892145157, "timestamp": "2025-09-04 03:47:08.775713", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:08.870347", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.04195958375930786, "timestamp": "2025-09-04 03:47:08.887314", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:08.989716", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.013827347196638584, "timestamp": "2025-09-04 03:47:09.008620", "step": 260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:47:17.480249", "step": 260, "epoch": 1 }, { "type": "pplx", "content": 313.3745874189351, "timestamp": "2025-09-04 03:47:17.482337", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:17.585602", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.056827034801244736, "timestamp": "2025-09-04 03:47:17.607847", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:17.707055", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.04780471324920654, "timestamp": "2025-09-04 03:47:17.725413", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:17.824753", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.024920674040913582, "timestamp": "2025-09-04 03:47:17.843217", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:47:17.953478", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.016696227714419365, "timestamp": "2025-09-04 03:47:17.974500", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:18.065874", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.0172750111669302, "timestamp": "2025-09-04 03:47:18.084464", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:47:18.194133", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.0052177682518959045, "timestamp": "2025-09-04 03:47:18.214495", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:47:18.298475", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.03147877752780914, "timestamp": "2025-09-04 03:47:18.313335", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:47:18.416050", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.03740302100777626, "timestamp": "2025-09-04 03:47:18.435957", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:47:18.634340", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.08886415511369705, "timestamp": "2025-09-04 03:47:18.677352", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:47:18.795835", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.024287065491080284, "timestamp": "2025-09-04 03:47:18.818027", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:18.913367", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.05668797716498375, "timestamp": "2025-09-04 03:47:18.930814", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:19.039604", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.05908737704157829, "timestamp": "2025-09-04 03:47:19.060419", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:19.164051", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.010910317301750183, "timestamp": "2025-09-04 03:47:19.185981", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:47:19.325988", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.019638676196336746, "timestamp": "2025-09-04 03:47:19.351679", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:47:19.436571", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.07036858052015305, "timestamp": "2025-09-04 03:47:19.451806", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:47:19.546137", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.026476135477423668, "timestamp": "2025-09-04 03:47:19.563442", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:19.661217", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.0284750796854496, "timestamp": "2025-09-04 03:47:19.681378", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:47:19.765056", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.10833613574504852, "timestamp": "2025-09-04 03:47:19.780198", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:19.872507", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.030009621754288673, "timestamp": "2025-09-04 03:47:19.889386", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:47:19.962887", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.05326032638549805, "timestamp": "2025-09-04 03:47:19.976593", "step": 280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:47:28.367516", "step": 280, "epoch": 1 }, { "type": "pplx", "content": 312.74428797361605, "timestamp": "2025-09-04 03:47:28.370395", "step": 280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 280", "timestamp": "2025-09-04 03:47:28.839161", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:47:28.912750", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.03643646836280823, "timestamp": "2025-09-04 03:47:28.927377", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:47:29.006960", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.04280658811330795, "timestamp": "2025-09-04 03:47:29.020511", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:29.117476", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.01976653002202511, "timestamp": "2025-09-04 03:47:29.134794", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:29.230289", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.0358404815196991, "timestamp": "2025-09-04 03:47:29.247810", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:29.346237", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.029284300282597542, "timestamp": "2025-09-04 03:47:29.366289", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:47:29.485868", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.1099967360496521, "timestamp": "2025-09-04 03:47:29.507498", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:29.604126", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.023914791643619537, "timestamp": "2025-09-04 03:47:29.621110", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:47:29.760597", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.05426377058029175, "timestamp": "2025-09-04 03:47:29.787210", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:29.879326", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.050082337111234665, "timestamp": "2025-09-04 03:47:29.897720", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:29.994274", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.05403923988342285, "timestamp": "2025-09-04 03:47:30.011501", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:30.111748", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.047084759920835495, "timestamp": "2025-09-04 03:47:30.130634", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:47:30.210351", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.07940501719713211, "timestamp": "2025-09-04 03:47:30.224933", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:30.323714", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.09987480938434601, "timestamp": "2025-09-04 03:47:30.344265", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 03:47:30.419688", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.0861741453409195, "timestamp": "2025-09-04 03:47:30.432119", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:30.541849", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.03038984164595604, "timestamp": "2025-09-04 03:47:30.561788", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:30.663178", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.017293287441134453, "timestamp": "2025-09-04 03:47:30.682096", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:47:30.766176", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.0584249384701252, "timestamp": "2025-09-04 03:47:30.782879", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:30.878075", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.053005725145339966, "timestamp": "2025-09-04 03:47:30.895153", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:30.996825", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.015453525818884373, "timestamp": "2025-09-04 03:47:31.015301", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:47:31.125136", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.013833635486662388, "timestamp": "2025-09-04 03:47:31.145828", "step": 300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:47:39.537720", "step": 300, "epoch": 1 }, { "type": "pplx", "content": 311.2226619091774, "timestamp": "2025-09-04 03:47:39.539677", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:39.635130", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.005779569037258625, "timestamp": "2025-09-04 03:47:39.655327", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:47:39.731923", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.11063537001609802, "timestamp": "2025-09-04 03:47:39.745638", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:39.839960", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.02686300128698349, "timestamp": "2025-09-04 03:47:39.857135", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:39.963489", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.06041613593697548, "timestamp": "2025-09-04 03:47:39.983975", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:40.083742", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.01706847734749317, "timestamp": "2025-09-04 03:47:40.104803", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:40.210413", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.011196613311767578, "timestamp": "2025-09-04 03:47:40.230267", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:47:40.320630", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.07578642666339874, "timestamp": "2025-09-04 03:47:40.337139", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:40.430408", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.04545224457979202, "timestamp": "2025-09-04 03:47:40.448605", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:40.549272", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.041280921548604965, "timestamp": "2025-09-04 03:47:40.570265", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:47:40.668868", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.05615556985139847, "timestamp": "2025-09-04 03:47:40.682155", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:47:40.785194", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.07136084884405136, "timestamp": "2025-09-04 03:47:40.804167", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:40.914245", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.019209880381822586, "timestamp": "2025-09-04 03:47:40.933587", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:47:41.019153", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.10823129117488861, "timestamp": "2025-09-04 03:47:41.036119", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:41.135194", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.03277946263551712, "timestamp": "2025-09-04 03:47:41.153529", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:47:41.237996", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.06067274883389473, "timestamp": "2025-09-04 03:47:41.253446", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:47:41.381637", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.01747446320950985, "timestamp": "2025-09-04 03:47:41.405328", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:41.496751", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.024807121604681015, "timestamp": "2025-09-04 03:47:41.515635", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:41.620768", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.008701799437403679, "timestamp": "2025-09-04 03:47:41.640483", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:47:41.746608", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.03150641545653343, "timestamp": "2025-09-04 03:47:41.766485", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:47:41.902464", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.04005073383450508, "timestamp": "2025-09-04 03:47:41.929101", "step": 320, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:47:50.311558", "step": 320, "epoch": 1 }, { "type": "pplx", "content": 309.1398029399646, "timestamp": "2025-09-04 03:47:50.313320", "step": 320, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 320", "timestamp": "2025-09-04 03:47:50.768107", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:47:50.886087", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.024150997400283813, "timestamp": "2025-09-04 03:47:50.911098", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:51.005388", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.013538923114538193, "timestamp": "2025-09-04 03:47:51.022833", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:47:51.158175", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.03433951735496521, "timestamp": "2025-09-04 03:47:51.183772", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:47:51.282848", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.04349237680435181, "timestamp": "2025-09-04 03:47:51.301876", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:47:51.408869", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.03356247767806053, "timestamp": "2025-09-04 03:47:51.431173", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:47:51.534373", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.04008670523762703, "timestamp": "2025-09-04 03:47:51.553175", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:47:51.629153", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.007040772121399641, "timestamp": "2025-09-04 03:47:51.642842", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:51.735567", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.03876568377017975, "timestamp": "2025-09-04 03:47:51.753448", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:47:51.844898", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.03228950873017311, "timestamp": "2025-09-04 03:47:51.863955", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:47:51.958554", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.09584397077560425, "timestamp": "2025-09-04 03:47:51.975592", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:52.076487", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.05004340410232544, "timestamp": "2025-09-04 03:47:52.095272", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:47:52.173583", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.03638507053256035, "timestamp": "2025-09-04 03:47:52.188303", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:47:52.280377", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.05368280038237572, "timestamp": "2025-09-04 03:47:52.299280", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:47:52.376428", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.05048099532723427, "timestamp": "2025-09-04 03:47:52.390448", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:47:52.494290", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.0237380713224411, "timestamp": "2025-09-04 03:47:52.513321", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:47:52.616448", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.014010374434292316, "timestamp": "2025-09-04 03:47:52.636162", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:47:52.724698", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.052086420357227325, "timestamp": "2025-09-04 03:47:52.743063", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:47:52.854009", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.04896777868270874, "timestamp": "2025-09-04 03:47:52.874728", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:47:52.974955", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.06648825109004974, "timestamp": "2025-09-04 03:47:52.993821", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:47:53.081148", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.09628743678331375, "timestamp": "2025-09-04 03:47:53.097474", "step": 340, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:02.050472", "step": 340, "epoch": 1 }, { "type": "pplx", "content": 307.83904285220194, "timestamp": "2025-09-04 03:48:02.053159", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:48:02.127903", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.027161644771695137, "timestamp": "2025-09-04 03:48:02.142965", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:48:02.218754", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.0412270650267601, "timestamp": "2025-09-04 03:48:02.232083", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:48:02.339531", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.058129601180553436, "timestamp": "2025-09-04 03:48:02.359457", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:02.470920", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.04298894852399826, "timestamp": "2025-09-04 03:48:02.492120", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:02.591173", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.04139872267842293, "timestamp": "2025-09-04 03:48:02.611744", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:48:02.748595", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.036456745117902756, "timestamp": "2025-09-04 03:48:02.774427", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:02.879028", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.09559651464223862, "timestamp": "2025-09-04 03:48:02.898191", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:48:02.971130", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.036566995084285736, "timestamp": "2025-09-04 03:48:02.984752", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:03.082772", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.04065980389714241, "timestamp": "2025-09-04 03:48:03.103069", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:03.208487", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.05061135068535805, "timestamp": "2025-09-04 03:48:03.227437", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:48:03.303490", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.016793936491012573, "timestamp": "2025-09-04 03:48:03.317128", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:48:03.401892", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.02601003088057041, "timestamp": "2025-09-04 03:48:03.417643", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:03.519469", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.02657604031264782, "timestamp": "2025-09-04 03:48:03.540081", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:48:03.627443", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.02620949223637581, "timestamp": "2025-09-04 03:48:03.643009", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:48:03.729847", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.05669962242245674, "timestamp": "2025-09-04 03:48:03.745335", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:03.847153", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.028145290911197662, "timestamp": "2025-09-04 03:48:03.866641", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:48:03.986575", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.0207599475979805, "timestamp": "2025-09-04 03:48:04.011984", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:48:04.091711", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.030706727877259254, "timestamp": "2025-09-04 03:48:04.105863", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:48:04.212114", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.010737722739577293, "timestamp": "2025-09-04 03:48:04.232012", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 03:48:04.362847", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.029181912541389465, "timestamp": "2025-09-04 03:48:04.387540", "step": 360, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:12.919636", "step": 360, "epoch": 1 }, { "type": "pplx", "content": 307.5586884862369, "timestamp": "2025-09-04 03:48:12.935018", "step": 360, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 360", "timestamp": "2025-09-04 03:48:13.409472", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:48:13.515183", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.03834038972854614, "timestamp": "2025-09-04 03:48:13.529742", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:48:13.687701", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.025468185544013977, "timestamp": "2025-09-04 03:48:13.706710", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:13.836524", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.0641966164112091, "timestamp": "2025-09-04 03:48:13.855536", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:48:13.996064", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.03254721313714981, "timestamp": "2025-09-04 03:48:14.017115", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:48:14.164508", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.14557507634162903, "timestamp": "2025-09-04 03:48:14.186890", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:14.293921", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.03721340000629425, "timestamp": "2025-09-04 03:48:14.311196", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:48:14.437001", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.011114726774394512, "timestamp": "2025-09-04 03:48:14.457608", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:48:14.600803", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.0390244759619236, "timestamp": "2025-09-04 03:48:14.618979", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:48:14.746624", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.09442558139562607, "timestamp": "2025-09-04 03:48:14.767824", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:14.883101", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.07992935180664062, "timestamp": "2025-09-04 03:48:14.901810", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:15.024280", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.05484499782323837, "timestamp": "2025-09-04 03:48:15.042914", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:48:15.147914", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.06619900465011597, "timestamp": "2025-09-04 03:48:15.164438", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:48:15.250873", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.03658528998494148, "timestamp": "2025-09-04 03:48:15.303114", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:15.433580", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.022451380267739296, "timestamp": "2025-09-04 03:48:15.452866", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 03:48:15.615676", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.05221700668334961, "timestamp": "2025-09-04 03:48:15.639715", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1040 ], "flops": 20800126336064.0 }, "timestamp": "2025-09-04 03:48:15.819023", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.09046898037195206, "timestamp": "2025-09-04 03:48:15.849378", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:48:15.998775", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.07325262576341629, "timestamp": "2025-09-04 03:48:16.023460", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:48:16.119471", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.042737096548080444, "timestamp": "2025-09-04 03:48:16.133559", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:16.283766", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.02026101015508175, "timestamp": "2025-09-04 03:48:16.304404", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:16.427741", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.011748573742806911, "timestamp": "2025-09-04 03:48:16.445240", "step": 380, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:25.287729", "step": 380, "epoch": 1 }, { "type": "pplx", "content": 311.8221380685682, "timestamp": "2025-09-04 03:48:25.295794", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:25.392305", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.013346564956009388, "timestamp": "2025-09-04 03:48:25.410543", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:25.535167", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.013797925785183907, "timestamp": "2025-09-04 03:48:25.555859", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:25.681692", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.015700260177254677, "timestamp": "2025-09-04 03:48:25.701141", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:25.835149", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.024386491626501083, "timestamp": "2025-09-04 03:48:25.859596", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:25.987712", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.016429277136921883, "timestamp": "2025-09-04 03:48:26.006472", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:48:26.122316", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.011799799278378487, "timestamp": "2025-09-04 03:48:26.141876", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:26.248691", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.06375425308942795, "timestamp": "2025-09-04 03:48:26.265309", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:48:26.346698", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.035253461450338364, "timestamp": "2025-09-04 03:48:26.361086", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:26.473191", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.05822211131453514, "timestamp": "2025-09-04 03:48:26.494275", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:48:26.575019", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.035860493779182434, "timestamp": "2025-09-04 03:48:26.588671", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:48:26.674728", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.025279760360717773, "timestamp": "2025-09-04 03:48:26.688319", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:48:26.776869", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.022274592891335487, "timestamp": "2025-09-04 03:48:26.792909", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:26.892516", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.026588624343276024, "timestamp": "2025-09-04 03:48:26.912495", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:27.007916", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.0037299662362784147, "timestamp": "2025-09-04 03:48:27.024992", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:27.130031", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.016043562442064285, "timestamp": "2025-09-04 03:48:27.149089", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:48:27.261367", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.022281493991613388, "timestamp": "2025-09-04 03:48:27.282593", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:27.381479", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.019878646358847618, "timestamp": "2025-09-04 03:48:27.401844", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:27.493393", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.06282302737236023, "timestamp": "2025-09-04 03:48:27.510053", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:27.627066", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.016571981832385063, "timestamp": "2025-09-04 03:48:27.646198", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:27.748235", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.05791422352194786, "timestamp": "2025-09-04 03:48:27.767514", "step": 400, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:36.321805", "step": 400, "epoch": 1 }, { "type": "pplx", "content": 316.7635327191992, "timestamp": "2025-09-04 03:48:36.324054", "step": 400, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 400", "timestamp": "2025-09-04 03:48:36.675112", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:48:36.774708", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.022503888234496117, "timestamp": "2025-09-04 03:48:36.795589", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:36.889466", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.021656574681401253, "timestamp": "2025-09-04 03:48:36.906649", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:48:36.992132", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.036612629890441895, "timestamp": "2025-09-04 03:48:37.007536", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:48:37.081308", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.015442884527146816, "timestamp": "2025-09-04 03:48:37.094723", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:37.185279", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.02640901878476143, "timestamp": "2025-09-04 03:48:37.203874", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:48:37.290602", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.05808902904391289, "timestamp": "2025-09-04 03:48:37.305715", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:37.401155", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.02911917120218277, "timestamp": "2025-09-04 03:48:37.418222", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:48:37.494702", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.012547432444989681, "timestamp": "2025-09-04 03:48:37.508723", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:37.615037", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.029442116618156433, "timestamp": "2025-09-04 03:48:37.637295", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:48:37.743694", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.017742721363902092, "timestamp": "2025-09-04 03:48:37.763621", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:37.854723", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.13883210718631744, "timestamp": "2025-09-04 03:48:37.871242", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:37.964918", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.06384449452161789, "timestamp": "2025-09-04 03:48:37.982828", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:48:38.096961", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.021444780752062798, "timestamp": "2025-09-04 03:48:38.120984", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:38.213825", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.023383496329188347, "timestamp": "2025-09-04 03:48:38.230925", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:38.334573", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.011450543999671936, "timestamp": "2025-09-04 03:48:38.353477", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:48:38.430473", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.03750946745276451, "timestamp": "2025-09-04 03:48:38.445210", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:38.543176", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.011969326063990593, "timestamp": "2025-09-04 03:48:38.563556", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:48:38.639745", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.07576386630535126, "timestamp": "2025-09-04 03:48:38.653215", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:48:38.787749", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.009221615269780159, "timestamp": "2025-09-04 03:48:38.813311", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:38.913752", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.020009953528642654, "timestamp": "2025-09-04 03:48:38.932761", "step": 420, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:47.342665", "step": 420, "epoch": 1 }, { "type": "pplx", "content": 319.4097698572594, "timestamp": "2025-09-04 03:48:47.344661", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:47.449253", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.053698133677244186, "timestamp": "2025-09-04 03:48:47.471467", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:47.581639", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.009336840361356735, "timestamp": "2025-09-04 03:48:47.602046", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:48:47.722268", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.017453299835324287, "timestamp": "2025-09-04 03:48:47.744067", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:48:47.827080", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.03400561586022377, "timestamp": "2025-09-04 03:48:47.842904", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:48:47.936119", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.012152721174061298, "timestamp": "2025-09-04 03:48:47.955176", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:48.055267", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.052000127732753754, "timestamp": "2025-09-04 03:48:48.073372", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:48.167311", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.01812596619129181, "timestamp": "2025-09-04 03:48:48.184092", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:48:48.295139", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.038504090160131454, "timestamp": "2025-09-04 03:48:48.316458", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:48.416934", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.07306548207998276, "timestamp": "2025-09-04 03:48:48.438023", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:48:48.533780", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.08940979093313217, "timestamp": "2025-09-04 03:48:48.551198", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:48.647281", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.01325355563312769, "timestamp": "2025-09-04 03:48:48.664338", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:48:48.759489", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.046491898596286774, "timestamp": "2025-09-04 03:48:48.777517", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:48:48.883612", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.12319537252187729, "timestamp": "2025-09-04 03:48:48.905497", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:49.016164", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.0106568094342947, "timestamp": "2025-09-04 03:48:49.036142", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:48:49.136703", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.009872270748019218, "timestamp": "2025-09-04 03:48:49.155241", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:49.256981", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.016583112999796867, "timestamp": "2025-09-04 03:48:49.275694", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:48:49.360494", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.008935445919632912, "timestamp": "2025-09-04 03:48:49.376832", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:49.468735", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.03650280088186264, "timestamp": "2025-09-04 03:48:49.485004", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:49.581069", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.017082802951335907, "timestamp": "2025-09-04 03:48:49.598116", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:49.702688", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.007683966308832169, "timestamp": "2025-09-04 03:48:49.722439", "step": 440, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:48:58.199774", "step": 440, "epoch": 1 }, { "type": "pplx", "content": 321.40916814073927, "timestamp": "2025-09-04 03:48:58.202230", "step": 440, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 440", "timestamp": "2025-09-04 03:48:58.565856", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:48:58.664276", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.023604106158018112, "timestamp": "2025-09-04 03:48:58.684199", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 03:48:58.904564", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.012493900023400784, "timestamp": "2025-09-04 03:48:58.946767", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:48:59.054947", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.027849415317177773, "timestamp": "2025-09-04 03:48:59.074061", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:48:59.168688", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.027136214077472687, "timestamp": "2025-09-04 03:48:59.186008", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:48:59.260823", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.06847383081912994, "timestamp": "2025-09-04 03:48:59.275370", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:48:59.387276", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.006238785106688738, "timestamp": "2025-09-04 03:48:59.407172", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:48:59.495549", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.060996416956186295, "timestamp": "2025-09-04 03:48:59.510441", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:48:59.602462", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.04064783453941345, "timestamp": "2025-09-04 03:48:59.619374", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:48:59.724508", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.08736442029476166, "timestamp": "2025-09-04 03:48:59.745799", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:48:59.850860", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.0009415106615051627, "timestamp": "2025-09-04 03:48:59.869303", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:48:59.964786", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.043740540742874146, "timestamp": "2025-09-04 03:48:59.981465", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:00.076418", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.015802182257175446, "timestamp": "2025-09-04 03:49:00.093653", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:00.195759", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.07930180430412292, "timestamp": "2025-09-04 03:49:00.216444", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:49:00.312721", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.01712995208799839, "timestamp": "2025-09-04 03:49:00.329671", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:49:00.439710", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.06034353747963905, "timestamp": "2025-09-04 03:49:00.460043", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:00.552284", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.008295533247292042, "timestamp": "2025-09-04 03:49:00.569255", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:49:00.653046", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.019054660573601723, "timestamp": "2025-09-04 03:49:00.669066", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:49:00.767740", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.01991303637623787, "timestamp": "2025-09-04 03:49:00.784981", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:00.886589", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.033343605697155, "timestamp": "2025-09-04 03:49:00.904779", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:01.005430", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.07760113477706909, "timestamp": "2025-09-04 03:49:01.024576", "step": 460, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:49:09.462043", "step": 460, "epoch": 1 }, { "type": "pplx", "content": 324.06946329758586, "timestamp": "2025-09-04 03:49:09.463792", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:49:09.541744", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.04705421254038811, "timestamp": "2025-09-04 03:49:09.558323", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:09.663299", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.004760258831083775, "timestamp": "2025-09-04 03:49:09.683378", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:09.775625", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.02817857638001442, "timestamp": "2025-09-04 03:49:09.792725", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:09.895742", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.04996614158153534, "timestamp": "2025-09-04 03:49:09.915742", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:10.013902", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.03526470810174942, "timestamp": "2025-09-04 03:49:10.034308", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:49:10.118179", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.061329569667577744, "timestamp": "2025-09-04 03:49:10.133268", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:49:10.335498", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.013789204880595207, "timestamp": "2025-09-04 03:49:10.374743", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:49:10.485354", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.049680087715387344, "timestamp": "2025-09-04 03:49:10.506771", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:49:10.581897", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.05645358934998512, "timestamp": "2025-09-04 03:49:10.597034", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:49:10.706775", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.01656719297170639, "timestamp": "2025-09-04 03:49:10.727119", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:49:10.805413", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.01675380952656269, "timestamp": "2025-09-04 03:49:10.819234", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:49:10.891804", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.005480926018208265, "timestamp": "2025-09-04 03:49:10.905273", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:49:11.023420", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.037396758794784546, "timestamp": "2025-09-04 03:49:11.048758", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:49:11.126774", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.029778504744172096, "timestamp": "2025-09-04 03:49:11.140580", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 03:49:11.277106", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.0628209188580513, "timestamp": "2025-09-04 03:49:11.301517", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:11.400955", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.02091328613460064, "timestamp": "2025-09-04 03:49:11.420085", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:11.510953", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.060857050120830536, "timestamp": "2025-09-04 03:49:11.529984", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:11.623012", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.03812684863805771, "timestamp": "2025-09-04 03:49:11.639932", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:49:11.725628", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.09244571626186371, "timestamp": "2025-09-04 03:49:11.741008", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:11.833497", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.05786127969622612, "timestamp": "2025-09-04 03:49:11.851190", "step": 480, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:49:20.202978", "step": 480, "epoch": 1 }, { "type": "pplx", "content": 328.5239349757793, "timestamp": "2025-09-04 03:49:20.204938", "step": 480, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 480", "timestamp": "2025-09-04 03:49:20.553378", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:49:20.657173", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.047159433364868164, "timestamp": "2025-09-04 03:49:20.679232", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:20.773246", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.07475479692220688, "timestamp": "2025-09-04 03:49:20.790699", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:20.893431", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.012311739847064018, "timestamp": "2025-09-04 03:49:20.912687", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:49:21.022565", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.011123532429337502, "timestamp": "2025-09-04 03:49:21.043679", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:49:21.148538", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.06513547897338867, "timestamp": "2025-09-04 03:49:21.170605", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:21.276579", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.019697437062859535, "timestamp": "2025-09-04 03:49:21.296331", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:21.403122", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.06598882377147675, "timestamp": "2025-09-04 03:49:21.422840", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:21.529193", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.0512818843126297, "timestamp": "2025-09-04 03:49:21.549823", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:49:21.624011", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.021413128823041916, "timestamp": "2025-09-04 03:49:21.638832", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:49:21.723109", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.018553882837295532, "timestamp": "2025-09-04 03:49:21.738095", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:49:21.839249", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.11178267747163773, "timestamp": "2025-09-04 03:49:21.856511", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:49:21.933185", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.07693499326705933, "timestamp": "2025-09-04 03:49:21.947886", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:22.035794", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.03976715728640556, "timestamp": "2025-09-04 03:49:22.053973", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:49:22.137152", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.025103233754634857, "timestamp": "2025-09-04 03:49:22.152121", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:49:22.259839", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.011973893269896507, "timestamp": "2025-09-04 03:49:22.279867", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:22.378194", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.08681105077266693, "timestamp": "2025-09-04 03:49:22.397335", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:49:22.471677", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.0133007001131773, "timestamp": "2025-09-04 03:49:22.486802", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:49:22.562649", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.05417775362730026, "timestamp": "2025-09-04 03:49:22.576169", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:49:22.652206", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.04143834114074707, "timestamp": "2025-09-04 03:49:22.665755", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:49:22.749786", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.021494613960385323, "timestamp": "2025-09-04 03:49:22.765498", "step": 500, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:49:31.153962", "step": 500, "epoch": 1 }, { "type": "pplx", "content": 332.01159701525927, "timestamp": "2025-09-04 03:49:31.156242", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:49:31.271328", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.057877492159605026, "timestamp": "2025-09-04 03:49:31.295176", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:31.398872", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.023877454921603203, "timestamp": "2025-09-04 03:49:31.418005", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:31.518096", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.017220618203282356, "timestamp": "2025-09-04 03:49:31.536414", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:49:31.647385", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.03338051587343216, "timestamp": "2025-09-04 03:49:31.668588", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:31.760724", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.008282708935439587, "timestamp": "2025-09-04 03:49:31.779547", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:49:31.854732", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.027649713680148125, "timestamp": "2025-09-04 03:49:31.868026", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:49:31.978236", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.07461496442556381, "timestamp": "2025-09-04 03:49:31.998627", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:32.101885", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.034678079187870026, "timestamp": "2025-09-04 03:49:32.121824", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:32.225900", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.017775679007172585, "timestamp": "2025-09-04 03:49:32.247614", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:49:32.324639", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.021030427888035774, "timestamp": "2025-09-04 03:49:32.337950", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:32.431041", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.0012499457225203514, "timestamp": "2025-09-04 03:49:32.448340", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:32.550823", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.05383225530385971, "timestamp": "2025-09-04 03:49:32.570216", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:32.673612", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.09943293035030365, "timestamp": "2025-09-04 03:49:32.695354", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:49:32.804728", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.067501500248909, "timestamp": "2025-09-04 03:49:32.825120", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:49:32.897035", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.056289635598659515, "timestamp": "2025-09-04 03:49:32.909856", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:33.004917", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.020486541092395782, "timestamp": "2025-09-04 03:49:33.022917", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:33.113978", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.09998507797718048, "timestamp": "2025-09-04 03:49:33.132916", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:49:33.234964", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.018785972148180008, "timestamp": "2025-09-04 03:49:33.254022", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:33.346763", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.05426869913935661, "timestamp": "2025-09-04 03:49:33.363701", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:49:33.458619", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.0028427981305867434, "timestamp": "2025-09-04 03:49:33.476697", "step": 520, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:49:41.838088", "step": 520, "epoch": 1 }, { "type": "pplx", "content": 331.47364260793984, "timestamp": "2025-09-04 03:49:41.839937", "step": 520, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 520", "timestamp": "2025-09-04 03:49:42.197749", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:49:42.304263", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.004516193643212318, "timestamp": "2025-09-04 03:49:42.326503", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:42.420198", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.029628755524754524, "timestamp": "2025-09-04 03:49:42.437520", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:42.537022", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.016332386061549187, "timestamp": "2025-09-04 03:49:42.555319", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:49:42.657450", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.12754516303539276, "timestamp": "2025-09-04 03:49:42.677064", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:42.767235", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.0154288774356246, "timestamp": "2025-09-04 03:49:42.785875", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:42.888440", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.04728075489401817, "timestamp": "2025-09-04 03:49:42.907264", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 03:49:42.974277", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.039872948080301285, "timestamp": "2025-09-04 03:49:42.985085", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:49:43.103378", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.15819469094276428, "timestamp": "2025-09-04 03:49:43.126041", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:43.214657", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.00868944637477398, "timestamp": "2025-09-04 03:49:43.232835", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:49:43.310728", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.043173011392354965, "timestamp": "2025-09-04 03:49:43.324708", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:43.424559", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.009749419055879116, "timestamp": "2025-09-04 03:49:43.443157", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:43.542929", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.060370367020368576, "timestamp": "2025-09-04 03:49:43.562461", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:49:43.646414", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.05955101177096367, "timestamp": "2025-09-04 03:49:43.663498", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:49:43.769394", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.011455871164798737, "timestamp": "2025-09-04 03:49:43.789213", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:43.883263", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.017603786662220955, "timestamp": "2025-09-04 03:49:43.900564", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:43.999470", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.004019308835268021, "timestamp": "2025-09-04 03:49:44.018774", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:44.108934", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.03117678314447403, "timestamp": "2025-09-04 03:49:44.127529", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:49:44.228887", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.02788373827934265, "timestamp": "2025-09-04 03:49:44.247823", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:49:44.323710", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.023979298770427704, "timestamp": "2025-09-04 03:49:44.337235", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:44.427420", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.0515667125582695, "timestamp": "2025-09-04 03:49:44.444749", "step": 540, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:49:52.800904", "step": 540, "epoch": 1 }, { "type": "pplx", "content": 328.1148207360572, "timestamp": "2025-09-04 03:49:52.802715", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:49:52.884845", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.034760650247335434, "timestamp": "2025-09-04 03:49:52.901766", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:49:53.030246", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.061195723712444305, "timestamp": "2025-09-04 03:49:53.053121", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:53.147174", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.04390040412545204, "timestamp": "2025-09-04 03:49:53.164465", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:49:53.258655", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.01793227717280388, "timestamp": "2025-09-04 03:49:53.276613", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:49:53.414366", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.0374876894056797, "timestamp": "2025-09-04 03:49:53.442685", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:53.543275", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.08235947042703629, "timestamp": "2025-09-04 03:49:53.561611", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:53.656351", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.04841390252113342, "timestamp": "2025-09-04 03:49:53.673261", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:49:53.781485", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.01317357737571001, "timestamp": "2025-09-04 03:49:53.802283", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:53.893276", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.016209116205573082, "timestamp": "2025-09-04 03:49:53.911823", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:49:54.016291", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.08481376618146896, "timestamp": "2025-09-04 03:49:54.035369", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:49:54.144944", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.045819301158189774, "timestamp": "2025-09-04 03:49:54.165459", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:54.256078", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.02893437258899212, "timestamp": "2025-09-04 03:49:54.273632", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:49:54.393325", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.0218905508518219, "timestamp": "2025-09-04 03:49:54.418542", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:54.511444", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.03285536542534828, "timestamp": "2025-09-04 03:49:54.528330", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:49:54.630178", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.008444556035101414, "timestamp": "2025-09-04 03:49:54.649073", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:49:54.740380", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.04841528460383415, "timestamp": "2025-09-04 03:49:54.757766", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:49:54.848997", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.02788812294602394, "timestamp": "2025-09-04 03:49:54.867556", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:49:54.942951", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.03387841582298279, "timestamp": "2025-09-04 03:49:54.956225", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:49:55.056086", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.038461048156023026, "timestamp": "2025-09-04 03:49:55.074363", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:49:55.161963", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.025093531236052513, "timestamp": "2025-09-04 03:49:55.177890", "step": 560, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:03.607868", "step": 560, "epoch": 1 }, { "type": "pplx", "content": 323.1010053666772, "timestamp": "2025-09-04 03:50:03.610489", "step": 560, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 560", "timestamp": "2025-09-04 03:50:04.111690", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:50:04.194180", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.02403130754828453, "timestamp": "2025-09-04 03:50:04.210903", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:04.300277", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.010109679773449898, "timestamp": "2025-09-04 03:50:04.316739", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:04.418541", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.053742896765470505, "timestamp": "2025-09-04 03:50:04.436868", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:50:04.547209", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.025198856368660927, "timestamp": "2025-09-04 03:50:04.568436", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:04.655610", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.04810674116015434, "timestamp": "2025-09-04 03:50:04.673889", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:50:04.782853", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.01729135774075985, "timestamp": "2025-09-04 03:50:04.802978", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:04.912535", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.03990844264626503, "timestamp": "2025-09-04 03:50:04.932830", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:50:05.016165", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.018527284264564514, "timestamp": "2025-09-04 03:50:05.031879", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:05.123611", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.04572749882936478, "timestamp": "2025-09-04 03:50:05.142464", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:05.251316", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.06394918262958527, "timestamp": "2025-09-04 03:50:05.271592", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:05.365527", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.023581665009260178, "timestamp": "2025-09-04 03:50:05.382808", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:05.485680", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.026009058579802513, "timestamp": "2025-09-04 03:50:05.505623", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:50:05.612207", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.011551128700375557, "timestamp": "2025-09-04 03:50:05.634936", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:05.737394", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.0341804064810276, "timestamp": "2025-09-04 03:50:05.756726", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:50:05.849571", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.09191818535327911, "timestamp": "2025-09-04 03:50:05.866319", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:05.966270", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.008478672243654728, "timestamp": "2025-09-04 03:50:05.985773", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:06.086160", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.10518831759691238, "timestamp": "2025-09-04 03:50:06.107230", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:06.205533", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.0350443534553051, "timestamp": "2025-09-04 03:50:06.223903", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:50:06.295644", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.03610123321413994, "timestamp": "2025-09-04 03:50:06.308378", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:06.402413", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.07018542289733887, "timestamp": "2025-09-04 03:50:06.420355", "step": 580, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:14.881907", "step": 580, "epoch": 1 }, { "type": "pplx", "content": 323.03369944740126, "timestamp": "2025-09-04 03:50:14.884156", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:14.980302", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.009829767979681492, "timestamp": "2025-09-04 03:50:15.000981", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 03:50:15.073314", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.01034059002995491, "timestamp": "2025-09-04 03:50:15.085969", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:15.182867", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.023535916581749916, "timestamp": "2025-09-04 03:50:15.200207", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:50:15.306387", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.048321448266506195, "timestamp": "2025-09-04 03:50:15.326896", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:15.426730", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.007726522162556648, "timestamp": "2025-09-04 03:50:15.447627", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:50:15.525155", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.10942022502422333, "timestamp": "2025-09-04 03:50:15.538987", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:50:15.645303", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.00408227788284421, "timestamp": "2025-09-04 03:50:15.665165", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:15.767654", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.013964972458779812, "timestamp": "2025-09-04 03:50:15.787665", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:15.894691", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.061664290726184845, "timestamp": "2025-09-04 03:50:15.916999", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:50:16.000197", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.08124200999736786, "timestamp": "2025-09-04 03:50:16.015088", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:50:16.098629", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.012426053173840046, "timestamp": "2025-09-04 03:50:16.113698", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:16.213709", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.11045590788125992, "timestamp": "2025-09-04 03:50:16.233343", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:16.333263", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.03249969705939293, "timestamp": "2025-09-04 03:50:16.354179", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:50:16.464080", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.023777302354574203, "timestamp": "2025-09-04 03:50:16.484351", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:16.593894", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.017426855862140656, "timestamp": "2025-09-04 03:50:16.614358", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:16.715910", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.032828718423843384, "timestamp": "2025-09-04 03:50:16.735355", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:16.826539", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.08800698816776276, "timestamp": "2025-09-04 03:50:16.845406", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:16.939524", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.05674955993890762, "timestamp": "2025-09-04 03:50:16.956622", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:17.057539", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.02611662819981575, "timestamp": "2025-09-04 03:50:17.076255", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:50:17.154424", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.037351664155721664, "timestamp": "2025-09-04 03:50:17.169108", "step": 600, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:25.545854", "step": 600, "epoch": 1 }, { "type": "pplx", "content": 324.7015294662687, "timestamp": "2025-09-04 03:50:25.548258", "step": 600, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 600", "timestamp": "2025-09-04 03:50:25.891967", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:50:25.963785", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.05272102355957031, "timestamp": "2025-09-04 03:50:25.978431", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:26.080090", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.013583468273282051, "timestamp": "2025-09-04 03:50:26.099071", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:26.197206", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.021021874621510506, "timestamp": "2025-09-04 03:50:26.215799", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:26.310990", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.012546015903353691, "timestamp": "2025-09-04 03:50:26.329269", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:26.429291", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.04589983820915222, "timestamp": "2025-09-04 03:50:26.450420", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:26.545770", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.04309983178973198, "timestamp": "2025-09-04 03:50:26.563268", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:50:26.648264", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.06627572327852249, "timestamp": "2025-09-04 03:50:26.663786", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:26.758202", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.02102210372686386, "timestamp": "2025-09-04 03:50:26.776450", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:50:26.892497", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.04680265486240387, "timestamp": "2025-09-04 03:50:26.916116", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:27.019321", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.05080176889896393, "timestamp": "2025-09-04 03:50:27.038659", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:27.138655", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.02795407734811306, "timestamp": "2025-09-04 03:50:27.157004", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:50:27.231965", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.025690896436572075, "timestamp": "2025-09-04 03:50:27.246524", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:50:27.326510", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.011252271011471748, "timestamp": "2025-09-04 03:50:27.343040", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:50:27.426099", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.029573671519756317, "timestamp": "2025-09-04 03:50:27.440961", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:27.542987", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.03328055888414383, "timestamp": "2025-09-04 03:50:27.562129", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:27.656249", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.03198854997754097, "timestamp": "2025-09-04 03:50:27.673816", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:27.775050", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.05293620377779007, "timestamp": "2025-09-04 03:50:27.795724", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:50:27.918294", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.030497848987579346, "timestamp": "2025-09-04 03:50:27.941330", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:28.055770", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.003063701558858156, "timestamp": "2025-09-04 03:50:28.074792", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:50:28.196976", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.00672591058537364, "timestamp": "2025-09-04 03:50:28.220908", "step": 620, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:36.596005", "step": 620, "epoch": 1 }, { "type": "pplx", "content": 330.22229839916525, "timestamp": "2025-09-04 03:50:36.598510", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:50:36.680013", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.07592124491930008, "timestamp": "2025-09-04 03:50:36.697115", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:36.797854", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.03790687769651413, "timestamp": "2025-09-04 03:50:36.816637", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:50:36.894069", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.05864779278635979, "timestamp": "2025-09-04 03:50:36.907933", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:50:37.000229", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.04656538739800453, "timestamp": "2025-09-04 03:50:37.017840", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:37.106987", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.08541495352983475, "timestamp": "2025-09-04 03:50:37.125351", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:37.227295", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.019130367785692215, "timestamp": "2025-09-04 03:50:37.245938", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:37.339064", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.030174510553479195, "timestamp": "2025-09-04 03:50:37.356225", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:37.451088", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.012393724173307419, "timestamp": "2025-09-04 03:50:37.469176", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:37.557289", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.03318622708320618, "timestamp": "2025-09-04 03:50:37.575472", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:50:37.652296", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.041939180344343185, "timestamp": "2025-09-04 03:50:37.666329", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:37.759762", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.01324822474271059, "timestamp": "2025-09-04 03:50:37.777018", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:50:37.855647", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.011515927501022816, "timestamp": "2025-09-04 03:50:37.870228", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:50:37.960942", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.03529660031199455, "timestamp": "2025-09-04 03:50:37.979640", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:38.081658", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.038785651326179504, "timestamp": "2025-09-04 03:50:38.100818", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 03:50:38.239722", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.014862718991935253, "timestamp": "2025-09-04 03:50:38.265727", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:38.366128", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.054356202483177185, "timestamp": "2025-09-04 03:50:38.385549", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:38.485770", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.12962757050991058, "timestamp": "2025-09-04 03:50:38.506762", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:38.617492", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.04526481032371521, "timestamp": "2025-09-04 03:50:38.635828", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:38.739668", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.022485429421067238, "timestamp": "2025-09-04 03:50:38.758702", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:38.861201", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.01773577369749546, "timestamp": "2025-09-04 03:50:38.880872", "step": 640, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:47.264720", "step": 640, "epoch": 1 }, { "type": "pplx", "content": 331.5543720069065, "timestamp": "2025-09-04 03:50:47.267008", "step": 640, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 640", "timestamp": "2025-09-04 03:50:47.629883", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 03:50:47.702312", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.018003080040216446, "timestamp": "2025-09-04 03:50:47.714561", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:50:47.791539", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.01093088649213314, "timestamp": "2025-09-04 03:50:47.805588", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:47.905150", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.02146296203136444, "timestamp": "2025-09-04 03:50:47.923461", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:48.024088", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.04993215575814247, "timestamp": "2025-09-04 03:50:48.043552", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:50:48.143506", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.08081956207752228, "timestamp": "2025-09-04 03:50:48.164354", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:48.277592", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.018953487277030945, "timestamp": "2025-09-04 03:50:48.297957", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:48.396340", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.024002674967050552, "timestamp": "2025-09-04 03:50:48.413527", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:50:48.521196", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.004336449783295393, "timestamp": "2025-09-04 03:50:48.542030", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:48.633156", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.0077681634575128555, "timestamp": "2025-09-04 03:50:48.652155", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:48.745478", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.03083970956504345, "timestamp": "2025-09-04 03:50:48.762789", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:48.872150", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.012532150372862816, "timestamp": "2025-09-04 03:50:48.892489", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:48.993220", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.060628145933151245, "timestamp": "2025-09-04 03:50:49.012649", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:49.103744", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.008690881542861462, "timestamp": "2025-09-04 03:50:49.122962", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:50:49.223690", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.014455270953476429, "timestamp": "2025-09-04 03:50:49.242713", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:49.336934", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.013105835765600204, "timestamp": "2025-09-04 03:50:49.354425", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:50:49.439534", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.07032399624586105, "timestamp": "2025-09-04 03:50:49.455865", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:50:49.528709", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.07407846301794052, "timestamp": "2025-09-04 03:50:49.543655", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:50:49.651803", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.009337909519672394, "timestamp": "2025-09-04 03:50:49.672288", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:49.766720", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.017982447519898415, "timestamp": "2025-09-04 03:50:49.784007", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:50:49.889673", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.009215055964887142, "timestamp": "2025-09-04 03:50:49.910355", "step": 660, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:50:58.286370", "step": 660, "epoch": 1 }, { "type": "pplx", "content": 330.0641296778058, "timestamp": "2025-09-04 03:50:58.288230", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:50:58.386848", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.02920273132622242, "timestamp": "2025-09-04 03:50:58.408025", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:50:58.498758", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.007981177419424057, "timestamp": "2025-09-04 03:50:58.515280", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:58.615295", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.017763612791895866, "timestamp": "2025-09-04 03:50:58.633595", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:50:58.734437", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.013931555673480034, "timestamp": "2025-09-04 03:50:58.753624", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:50:58.858226", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.013805502094328403, "timestamp": "2025-09-04 03:50:58.880186", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:50:58.975496", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.040271319448947906, "timestamp": "2025-09-04 03:50:58.992762", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:50:59.088259", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.008608100935816765, "timestamp": "2025-09-04 03:50:59.105528", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:50:59.215099", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.018997695297002792, "timestamp": "2025-09-04 03:50:59.235830", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:50:59.340368", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.04310667887330055, "timestamp": "2025-09-04 03:50:59.362359", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:50:59.436010", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.01862409897148609, "timestamp": "2025-09-04 03:50:59.448669", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:50:59.533601", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.032819997519254684, "timestamp": "2025-09-04 03:50:59.548822", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:50:59.655577", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.024017833173274994, "timestamp": "2025-09-04 03:50:59.676063", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:50:59.767083", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.026013823226094246, "timestamp": "2025-09-04 03:50:59.785657", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:50:59.879185", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.0160057432949543, "timestamp": "2025-09-04 03:50:59.896099", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:00.000068", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.016003988683223724, "timestamp": "2025-09-04 03:51:00.019146", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:51:00.115125", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.021060237661004066, "timestamp": "2025-09-04 03:51:00.133211", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:00.236367", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.05808281898498535, "timestamp": "2025-09-04 03:51:00.258278", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 03:51:00.421947", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.013201756402850151, "timestamp": "2025-09-04 03:51:00.453778", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:51:00.538723", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.02031007781624794, "timestamp": "2025-09-04 03:51:00.553892", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:00.657200", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.034204911440610886, "timestamp": "2025-09-04 03:51:00.676840", "step": 680, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:51:09.031779", "step": 680, "epoch": 1 }, { "type": "pplx", "content": 330.2300531347332, "timestamp": "2025-09-04 03:51:09.033666", "step": 680, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 680", "timestamp": "2025-09-04 03:51:09.446722", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:51:09.542085", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.06078000366687775, "timestamp": "2025-09-04 03:51:09.562352", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:09.664340", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.017026284709572792, "timestamp": "2025-09-04 03:51:09.683255", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:09.786570", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.03394745662808418, "timestamp": "2025-09-04 03:51:09.805775", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:09.908393", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.03497334569692612, "timestamp": "2025-09-04 03:51:09.928050", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:51:10.017677", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.025791537016630173, "timestamp": "2025-09-04 03:51:10.036239", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:51:10.152751", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.049982357770204544, "timestamp": "2025-09-04 03:51:10.174866", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:10.265581", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.023276111111044884, "timestamp": "2025-09-04 03:51:10.282308", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:51:10.378533", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.016552327200770378, "timestamp": "2025-09-04 03:51:10.396569", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:51:10.513537", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.014692885801196098, "timestamp": "2025-09-04 03:51:10.537386", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:10.643966", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.08659341931343079, "timestamp": "2025-09-04 03:51:10.663724", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:10.741883", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.030853156000375748, "timestamp": "2025-09-04 03:51:10.755931", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:51:10.839386", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.030888579785823822, "timestamp": "2025-09-04 03:51:10.854966", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:51:10.936300", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.03811494633555412, "timestamp": "2025-09-04 03:51:10.952811", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:11.058803", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.013888245448470116, "timestamp": "2025-09-04 03:51:11.078585", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:51:11.214227", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.03252805024385452, "timestamp": "2025-09-04 03:51:11.239966", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:51:11.325914", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.057707663625478745, "timestamp": "2025-09-04 03:51:11.342329", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:51:11.414814", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.013484358787536621, "timestamp": "2025-09-04 03:51:11.429372", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:51:11.501648", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.05915412679314613, "timestamp": "2025-09-04 03:51:11.514316", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:51:11.610602", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.024636950343847275, "timestamp": "2025-09-04 03:51:11.627855", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:51:11.721668", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.017293041571974754, "timestamp": "2025-09-04 03:51:11.739406", "step": 700, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:51:20.115650", "step": 700, "epoch": 1 }, { "type": "pplx", "content": 334.05662580498205, "timestamp": "2025-09-04 03:51:20.117602", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:20.216569", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.020366515964269638, "timestamp": "2025-09-04 03:51:20.237665", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:20.345409", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.03450450301170349, "timestamp": "2025-09-04 03:51:20.365186", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:20.442718", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.031207023188471794, "timestamp": "2025-09-04 03:51:20.456727", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:20.548080", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.09872627258300781, "timestamp": "2025-09-04 03:51:20.565328", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:20.663398", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.03153887018561363, "timestamp": "2025-09-04 03:51:20.683915", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:20.786006", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.025656569749116898, "timestamp": "2025-09-04 03:51:20.805069", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:20.905492", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.007625031750649214, "timestamp": "2025-09-04 03:51:20.924018", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:21.001433", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.013403641991317272, "timestamp": "2025-09-04 03:51:21.016042", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:51:21.089639", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.04379529878497124, "timestamp": "2025-09-04 03:51:21.104530", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:21.210734", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.03386307880282402, "timestamp": "2025-09-04 03:51:21.230561", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:21.339667", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.023136505857110023, "timestamp": "2025-09-04 03:51:21.360277", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:21.470082", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.01604538969695568, "timestamp": "2025-09-04 03:51:21.491196", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:51:21.574067", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.0386708602309227, "timestamp": "2025-09-04 03:51:21.590838", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:21.701235", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.00971259456127882, "timestamp": "2025-09-04 03:51:21.721504", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:21.812160", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.06396154314279556, "timestamp": "2025-09-04 03:51:21.829002", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:51:21.928434", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.044354457408189774, "timestamp": "2025-09-04 03:51:21.947527", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:22.037338", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.02656322531402111, "timestamp": "2025-09-04 03:51:22.055648", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:51:22.134993", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.03358420729637146, "timestamp": "2025-09-04 03:51:22.148882", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:22.250678", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.008803064003586769, "timestamp": "2025-09-04 03:51:22.269676", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:22.360223", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.1287224441766739, "timestamp": "2025-09-04 03:51:22.377492", "step": 720, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:51:30.762033", "step": 720, "epoch": 1 }, { "type": "pplx", "content": 336.26355182577953, "timestamp": "2025-09-04 03:51:30.763799", "step": 720, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 720", "timestamp": "2025-09-04 03:51:31.234372", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:51:31.352125", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.00398655841127038, "timestamp": "2025-09-04 03:51:31.377462", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:51:31.461725", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.007859241217374802, "timestamp": "2025-09-04 03:51:31.477012", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:51:31.571329", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.027930831536650658, "timestamp": "2025-09-04 03:51:31.588467", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:51:31.687794", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.008873346261680126, "timestamp": "2025-09-04 03:51:31.706953", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:31.783615", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.06903643906116486, "timestamp": "2025-09-04 03:51:31.798877", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:31.901354", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.05149473994970322, "timestamp": "2025-09-04 03:51:31.920403", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:51:31.996940", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.0763927474617958, "timestamp": "2025-09-04 03:51:32.010521", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:32.113768", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.0026997928507626057, "timestamp": "2025-09-04 03:51:32.133545", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:51:32.210082", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.0394502729177475, "timestamp": "2025-09-04 03:51:32.225171", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:32.334298", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.009021886624395847, "timestamp": "2025-09-04 03:51:32.354559", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:51:32.442076", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.00816608127206564, "timestamp": "2025-09-04 03:51:32.457490", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 03:51:32.583086", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.03670935705304146, "timestamp": "2025-09-04 03:51:32.607621", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:51:32.699631", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.056674499064683914, "timestamp": "2025-09-04 03:51:32.718181", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:51:32.803873", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.14876963198184967, "timestamp": "2025-09-04 03:51:32.818788", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:51:32.912540", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.012708455324172974, "timestamp": "2025-09-04 03:51:32.929701", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:33.039167", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.09964840114116669, "timestamp": "2025-09-04 03:51:33.060181", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:33.161479", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.03136486932635307, "timestamp": "2025-09-04 03:51:33.181948", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:33.285762", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.04356590285897255, "timestamp": "2025-09-04 03:51:33.304795", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:33.405495", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.01183301117271185, "timestamp": "2025-09-04 03:51:33.424143", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:33.503460", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.02132677659392357, "timestamp": "2025-09-04 03:51:33.518178", "step": 740, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:51:41.890219", "step": 740, "epoch": 1 }, { "type": "pplx", "content": 330.76754669519477, "timestamp": "2025-09-04 03:51:41.892697", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:41.994167", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.044686876237392426, "timestamp": "2025-09-04 03:51:42.015959", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:42.126745", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.04039061442017555, "timestamp": "2025-09-04 03:51:42.147095", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:42.249214", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.022664330899715424, "timestamp": "2025-09-04 03:51:42.267518", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:51:42.364434", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.02850925363600254, "timestamp": "2025-09-04 03:51:42.382084", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:42.472301", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.03210241347551346, "timestamp": "2025-09-04 03:51:42.490534", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:51:42.570864", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.0489024817943573, "timestamp": "2025-09-04 03:51:42.584554", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:51:42.693167", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.034052006900310516, "timestamp": "2025-09-04 03:51:42.713106", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:51:42.830107", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.012416801415383816, "timestamp": "2025-09-04 03:51:42.852718", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:51:42.960820", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.05252353101968765, "timestamp": "2025-09-04 03:51:42.983407", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:51:43.060970", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.021606309339404106, "timestamp": "2025-09-04 03:51:43.074914", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:43.165463", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.009972968138754368, "timestamp": "2025-09-04 03:51:43.182006", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:51:43.292002", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.017765356227755547, "timestamp": "2025-09-04 03:51:43.313301", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:43.411771", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.04759746789932251, "timestamp": "2025-09-04 03:51:43.432225", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:51:43.540540", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.023347793146967888, "timestamp": "2025-09-04 03:51:43.560567", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:51:43.644484", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.04734707623720169, "timestamp": "2025-09-04 03:51:43.659595", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:43.768987", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.026827994734048843, "timestamp": "2025-09-04 03:51:43.790268", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:51:43.866644", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.055104807019233704, "timestamp": "2025-09-04 03:51:43.881921", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:43.984924", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.03936297073960304, "timestamp": "2025-09-04 03:51:44.004115", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:51:44.090732", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.07558504492044449, "timestamp": "2025-09-04 03:51:44.106115", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:51:44.208853", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.011148090474307537, "timestamp": "2025-09-04 03:51:44.228842", "step": 760, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:51:52.633240", "step": 760, "epoch": 1 }, { "type": "pplx", "content": 325.2410786741373, "timestamp": "2025-09-04 03:51:52.635164", "step": 760, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 760", "timestamp": "2025-09-04 03:51:53.016229", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:51:53.098194", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.038804348558187485, "timestamp": "2025-09-04 03:51:53.114486", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:53.225215", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.09771209210157394, "timestamp": "2025-09-04 03:51:53.245500", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:51:53.339628", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.015294702723622322, "timestamp": "2025-09-04 03:51:53.356673", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:53.457292", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.06018395349383354, "timestamp": "2025-09-04 03:51:53.476673", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:51:53.573539", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.015591312199831009, "timestamp": "2025-09-04 03:51:53.593665", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:51:53.686695", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.04299828037619591, "timestamp": "2025-09-04 03:51:53.703554", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:53.806707", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.06639469414949417, "timestamp": "2025-09-04 03:51:53.825270", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:51:53.927352", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.06221333146095276, "timestamp": "2025-09-04 03:51:53.947041", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:51:54.053386", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.014062750153243542, "timestamp": "2025-09-04 03:51:54.075653", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:51:54.184801", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.05087198689579964, "timestamp": "2025-09-04 03:51:54.204773", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:54.305488", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.021481147035956383, "timestamp": "2025-09-04 03:51:54.324163", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 03:51:54.394241", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.031000670045614243, "timestamp": "2025-09-04 03:51:54.407850", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:51:54.499948", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.0011418497888371348, "timestamp": "2025-09-04 03:51:54.518917", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:51:54.625581", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.02311205305159092, "timestamp": "2025-09-04 03:51:54.645492", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 03:51:54.780476", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.02674124389886856, "timestamp": "2025-09-04 03:51:54.806404", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:51:54.915425", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.04661679267883301, "timestamp": "2025-09-04 03:51:54.936816", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:51:55.019120", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.04548424482345581, "timestamp": "2025-09-04 03:51:55.036198", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:51:55.125560", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.007166591938585043, "timestamp": "2025-09-04 03:51:55.142205", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:55.244199", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.06729776412248611, "timestamp": "2025-09-04 03:51:55.262780", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:51:55.362266", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.007628207094967365, "timestamp": "2025-09-04 03:51:55.381603", "step": 780, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:03.772736", "step": 780, "epoch": 1 }, { "type": "pplx", "content": 323.20181798344566, "timestamp": "2025-09-04 03:52:03.775082", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:52:03.857745", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.004363184329122305, "timestamp": "2025-09-04 03:52:03.874922", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:03.975884", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.01874414086341858, "timestamp": "2025-09-04 03:52:03.994665", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:52:04.082345", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.0951443463563919, "timestamp": "2025-09-04 03:52:04.097756", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:04.192694", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.09938696026802063, "timestamp": "2025-09-04 03:52:04.210641", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:52:04.323748", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.010282545350492, "timestamp": "2025-09-04 03:52:04.347897", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:04.446431", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.1636459231376648, "timestamp": "2025-09-04 03:52:04.465085", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:52:04.551108", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.022835776209831238, "timestamp": "2025-09-04 03:52:04.566559", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:52:04.688871", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.004113573580980301, "timestamp": "2025-09-04 03:52:04.712558", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:04.813658", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.006749707739800215, "timestamp": "2025-09-04 03:52:04.834669", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:04.935197", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.04576265811920166, "timestamp": "2025-09-04 03:52:04.953879", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:05.047740", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.024862490594387054, "timestamp": "2025-09-04 03:52:05.065114", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:05.165236", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.006702667102217674, "timestamp": "2025-09-04 03:52:05.184551", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:05.276679", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.017640264704823494, "timestamp": "2025-09-04 03:52:05.295657", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:05.390345", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.03208797425031662, "timestamp": "2025-09-04 03:52:05.407251", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:05.508560", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.03511238843202591, "timestamp": "2025-09-04 03:52:05.527172", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:05.620511", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.08914028853178024, "timestamp": "2025-09-04 03:52:05.638141", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:05.741354", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.08203835040330887, "timestamp": "2025-09-04 03:52:05.763282", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:05.867104", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.04981414973735809, "timestamp": "2025-09-04 03:52:05.886381", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:05.985482", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.007873651571571827, "timestamp": "2025-09-04 03:52:06.004060", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:06.106641", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.008369551040232182, "timestamp": "2025-09-04 03:52:06.125955", "step": 800, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:14.582165", "step": 800, "epoch": 1 }, { "type": "pplx", "content": 322.3950710812259, "timestamp": "2025-09-04 03:52:14.584469", "step": 800, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 800", "timestamp": "2025-09-04 03:52:14.943950", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:15.032165", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.013865088112652302, "timestamp": "2025-09-04 03:52:15.050000", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:52:15.145702", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.011338443495333195, "timestamp": "2025-09-04 03:52:15.162426", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:15.266835", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.01849699579179287, "timestamp": "2025-09-04 03:52:15.285675", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 03:52:15.424546", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.046258725225925446, "timestamp": "2025-09-04 03:52:15.450883", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:15.541096", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.06461029499769211, "timestamp": "2025-09-04 03:52:15.558768", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:15.650764", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.010986247099936008, "timestamp": "2025-09-04 03:52:15.667207", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:15.771785", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.002984520047903061, "timestamp": "2025-09-04 03:52:15.790613", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:15.883372", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.014198306016623974, "timestamp": "2025-09-04 03:52:15.900243", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:16.002786", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.03292492404580116, "timestamp": "2025-09-04 03:52:16.023360", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:16.128536", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.031847354024648666, "timestamp": "2025-09-04 03:52:16.147172", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:52:16.225296", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.0046399482525885105, "timestamp": "2025-09-04 03:52:16.238472", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:16.350870", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.028976459056138992, "timestamp": "2025-09-04 03:52:16.371461", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:16.471884", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.06571343541145325, "timestamp": "2025-09-04 03:52:16.491658", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:16.597372", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.004271671175956726, "timestamp": "2025-09-04 03:52:16.615972", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:16.721553", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.04265111684799194, "timestamp": "2025-09-04 03:52:16.740168", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:16.843856", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.0795649066567421, "timestamp": "2025-09-04 03:52:16.862808", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:16.965373", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.007744000293314457, "timestamp": "2025-09-04 03:52:16.985818", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:52:17.097956", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.014986629597842693, "timestamp": "2025-09-04 03:52:17.118420", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:52:17.240914", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.019025737419724464, "timestamp": "2025-09-04 03:52:17.262418", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:17.369933", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.05677224323153496, "timestamp": "2025-09-04 03:52:17.390488", "step": 820, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:25.880722", "step": 820, "epoch": 1 }, { "type": "pplx", "content": 324.48675427637124, "timestamp": "2025-09-04 03:52:25.882971", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:25.983499", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.029658645391464233, "timestamp": "2025-09-04 03:52:26.004547", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:52:26.093326", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.01038702204823494, "timestamp": "2025-09-04 03:52:26.108769", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:26.214631", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.001975145423784852, "timestamp": "2025-09-04 03:52:26.231760", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 03:52:26.416298", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.0497569777071476, "timestamp": "2025-09-04 03:52:26.451588", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:26.569750", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.02662699669599533, "timestamp": "2025-09-04 03:52:26.588463", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:26.697814", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.0038292373064905405, "timestamp": "2025-09-04 03:52:26.718387", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:26.821113", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.07293792068958282, "timestamp": "2025-09-04 03:52:26.840094", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:52:26.923604", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.13208356499671936, "timestamp": "2025-09-04 03:52:26.939479", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:52:27.024504", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.03219275176525116, "timestamp": "2025-09-04 03:52:27.041444", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:27.131547", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.00648118881508708, "timestamp": "2025-09-04 03:52:27.148096", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:27.248076", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.04718891531229019, "timestamp": "2025-09-04 03:52:27.266378", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:27.365554", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.004248224198818207, "timestamp": "2025-09-04 03:52:27.384715", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 03:52:27.565166", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.010911565274000168, "timestamp": "2025-09-04 03:52:27.603065", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:27.702939", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.036790501326322556, "timestamp": "2025-09-04 03:52:27.721421", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:52:27.798834", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.08580999821424484, "timestamp": "2025-09-04 03:52:27.812651", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:27.912675", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.00923218298703432, "timestamp": "2025-09-04 03:52:27.932194", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:28.031507", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.007355087902396917, "timestamp": "2025-09-04 03:52:28.052146", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:28.148182", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.017146985977888107, "timestamp": "2025-09-04 03:52:28.165495", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:28.267178", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.021363992244005203, "timestamp": "2025-09-04 03:52:28.286139", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:28.395064", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.07390808314085007, "timestamp": "2025-09-04 03:52:28.416262", "step": 840, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:36.789891", "step": 840, "epoch": 1 }, { "type": "pplx", "content": 324.41111865398125, "timestamp": "2025-09-04 03:52:36.791946", "step": 840, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 840", "timestamp": "2025-09-04 03:52:37.138794", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:37.228808", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.008902303874492645, "timestamp": "2025-09-04 03:52:37.247382", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:52:37.322184", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.04457436874508858, "timestamp": "2025-09-04 03:52:37.335432", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:52:37.443107", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.011131439357995987, "timestamp": "2025-09-04 03:52:37.463335", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:37.571948", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.03997879475355148, "timestamp": "2025-09-04 03:52:37.593109", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:37.681186", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.05826606974005699, "timestamp": "2025-09-04 03:52:37.699242", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:37.798938", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.013584684580564499, "timestamp": "2025-09-04 03:52:37.817704", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 03:52:38.022798", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.00445243064314127, "timestamp": "2025-09-04 03:52:38.061808", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:52:38.163815", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.022547859698534012, "timestamp": "2025-09-04 03:52:38.183460", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:52:38.289760", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.02277173474431038, "timestamp": "2025-09-04 03:52:38.312237", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:52:38.396170", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.055064212530851364, "timestamp": "2025-09-04 03:52:38.411132", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:52:38.494339", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.009578646160662174, "timestamp": "2025-09-04 03:52:38.509205", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:38.603857", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.006946980953216553, "timestamp": "2025-09-04 03:52:38.621878", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:38.723300", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.10980971902608871, "timestamp": "2025-09-04 03:52:38.744250", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:38.836222", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.02813224121928215, "timestamp": "2025-09-04 03:52:38.852773", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:38.952308", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.04458193480968475, "timestamp": "2025-09-04 03:52:38.970909", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:52:39.048237", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.03821039944887161, "timestamp": "2025-09-04 03:52:39.062827", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:39.152919", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.032031524926424026, "timestamp": "2025-09-04 03:52:39.171567", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:52:39.266660", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.026080451905727386, "timestamp": "2025-09-04 03:52:39.283145", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:39.393922", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.06203945353627205, "timestamp": "2025-09-04 03:52:39.414479", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:39.518098", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.0699310228228569, "timestamp": "2025-09-04 03:52:39.537950", "step": 860, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:47.899492", "step": 860, "epoch": 1 }, { "type": "pplx", "content": 322.49340806315365, "timestamp": "2025-09-04 03:52:47.901562", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:48.003512", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.03510546684265137, "timestamp": "2025-09-04 03:52:48.025476", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:48.129042", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.010071228258311749, "timestamp": "2025-09-04 03:52:48.148376", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:48.251091", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.03212030977010727, "timestamp": "2025-09-04 03:52:48.270097", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:48.375653", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.09204772859811783, "timestamp": "2025-09-04 03:52:48.396300", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:48.486243", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.03038841113448143, "timestamp": "2025-09-04 03:52:48.504904", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:48.606251", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.058843135833740234, "timestamp": "2025-09-04 03:52:48.624971", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 03:52:48.753998", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.011757034808397293, "timestamp": "2025-09-04 03:52:48.778443", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:48.883752", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.055646833032369614, "timestamp": "2025-09-04 03:52:48.903198", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:48.999421", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.007672054693102837, "timestamp": "2025-09-04 03:52:49.019789", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:49.126050", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.04655780270695686, "timestamp": "2025-09-04 03:52:49.145853", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:52:49.239252", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.009051861241459846, "timestamp": "2025-09-04 03:52:49.256412", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:52:49.340677", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.013100282289087772, "timestamp": "2025-09-04 03:52:49.356697", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:52:49.437905", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.017356760799884796, "timestamp": "2025-09-04 03:52:49.454466", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:49.557371", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.02461932599544525, "timestamp": "2025-09-04 03:52:49.576385", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:52:49.683118", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.040192510932683945, "timestamp": "2025-09-04 03:52:49.702845", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:49.803670", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.04767023026943207, "timestamp": "2025-09-04 03:52:49.823120", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:49.930415", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.0067078592255711555, "timestamp": "2025-09-04 03:52:49.952767", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:52:50.051711", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.0437939316034317, "timestamp": "2025-09-04 03:52:50.070336", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:52:50.180351", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.00434476463124156, "timestamp": "2025-09-04 03:52:50.200624", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:50.293161", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.02230999432504177, "timestamp": "2025-09-04 03:52:50.310845", "step": 880, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:52:58.696210", "step": 880, "epoch": 1 }, { "type": "pplx", "content": 320.0679803850578, "timestamp": "2025-09-04 03:52:58.697942", "step": 880, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 880", "timestamp": "2025-09-04 03:52:59.217621", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:59.308241", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.021877720952033997, "timestamp": "2025-09-04 03:52:59.326920", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:52:59.423770", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.03120221011340618, "timestamp": "2025-09-04 03:52:59.441082", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:59.533374", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.017093293368816376, "timestamp": "2025-09-04 03:52:59.550315", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:52:59.654042", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.045136742293834686, "timestamp": "2025-09-04 03:52:59.674000", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:52:59.771460", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.06527303159236908, "timestamp": "2025-09-04 03:52:59.792160", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:52:59.887195", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.01749110408127308, "timestamp": "2025-09-04 03:52:59.904116", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:00.010118", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.006269685458391905, "timestamp": "2025-09-04 03:53:00.029842", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:00.134196", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.012028466910123825, "timestamp": "2025-09-04 03:53:00.154044", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:53:00.265849", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.030681060627102852, "timestamp": "2025-09-04 03:53:00.288336", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:00.394509", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.014401630498468876, "timestamp": "2025-09-04 03:53:00.413831", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:53:00.497526", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.04923472926020622, "timestamp": "2025-09-04 03:53:00.512537", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:00.611406", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.056294139474630356, "timestamp": "2025-09-04 03:53:00.630919", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:53:00.735420", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.05868148058652878, "timestamp": "2025-09-04 03:53:00.755788", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:00.857660", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.04890258610248566, "timestamp": "2025-09-04 03:53:00.876619", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:00.976525", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.024650558829307556, "timestamp": "2025-09-04 03:53:00.995278", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:01.094960", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.007203355897217989, "timestamp": "2025-09-04 03:53:01.114386", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:01.215264", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.04748008772730827, "timestamp": "2025-09-04 03:53:01.236042", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:01.342527", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.011301451362669468, "timestamp": "2025-09-04 03:53:01.362187", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:01.452563", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.03385727107524872, "timestamp": "2025-09-04 03:53:01.469235", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:01.575911", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.037546951323747635, "timestamp": "2025-09-04 03:53:01.595843", "step": 900, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:53:09.952980", "step": 900, "epoch": 1 }, { "type": "pplx", "content": 319.89447585761627, "timestamp": "2025-09-04 03:53:09.954474", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:10.052039", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.01678859256207943, "timestamp": "2025-09-04 03:53:10.073130", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:10.166839", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.038133881986141205, "timestamp": "2025-09-04 03:53:10.183993", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:10.274791", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.019613448530435562, "timestamp": "2025-09-04 03:53:10.291441", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:53:10.379595", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.020688189193606377, "timestamp": "2025-09-04 03:53:10.395757", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:10.487110", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.005966340657323599, "timestamp": "2025-09-04 03:53:10.505940", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:10.607711", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.01767931506037712, "timestamp": "2025-09-04 03:53:10.626611", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:10.719273", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.02633294090628624, "timestamp": "2025-09-04 03:53:10.736157", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:10.839880", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.03189924731850624, "timestamp": "2025-09-04 03:53:10.859731", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:10.962770", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.003187986556440592, "timestamp": "2025-09-04 03:53:10.984587", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:11.084774", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.07128451019525528, "timestamp": "2025-09-04 03:53:11.101434", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:11.213242", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.048821836709976196, "timestamp": "2025-09-04 03:53:11.232355", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:11.347294", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.00408367533236742, "timestamp": "2025-09-04 03:53:11.366967", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:11.473704", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.04687316343188286, "timestamp": "2025-09-04 03:53:11.492664", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:11.586826", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.035709548741579056, "timestamp": "2025-09-04 03:53:11.603966", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:11.707434", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.0572127141058445, "timestamp": "2025-09-04 03:53:11.726695", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:11.816754", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.09570334106683731, "timestamp": "2025-09-04 03:53:11.834156", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:11.932053", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.05318867787718773, "timestamp": "2025-09-04 03:53:11.952724", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:12.054726", "step": 917, "epoch": 1 }, { "type": "loss", "content": 0.008588760159909725, "timestamp": "2025-09-04 03:53:12.073695", "step": 918, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:53:12.212450", "step": 918, "epoch": 1 }, { "type": "loss", "content": 0.007535313721746206, "timestamp": "2025-09-04 03:53:12.238373", "step": 919, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:12.332103", "step": 919, "epoch": 1 }, { "type": "loss", "content": 0.020334357395768166, "timestamp": "2025-09-04 03:53:12.350325", "step": 920, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:53:20.755706", "step": 920, "epoch": 1 }, { "type": "pplx", "content": 323.29331082900484, "timestamp": "2025-09-04 03:53:20.757985", "step": 920, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 920", "timestamp": "2025-09-04 03:53:21.106061", "step": 920, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:21.206709", "step": 920, "epoch": 1 }, { "type": "loss", "content": 0.004637205507606268, "timestamp": "2025-09-04 03:53:21.227753", "step": 921, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:53:21.303208", "step": 921, "epoch": 1 }, { "type": "loss", "content": 0.02929893508553505, "timestamp": "2025-09-04 03:53:21.316393", "step": 922, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:21.409671", "step": 922, "epoch": 1 }, { "type": "loss", "content": 0.05244254320859909, "timestamp": "2025-09-04 03:53:21.426608", "step": 923, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:53:21.561973", "step": 923, "epoch": 1 }, { "type": "loss", "content": 0.014228380285203457, "timestamp": "2025-09-04 03:53:21.588607", "step": 924, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:21.689260", "step": 924, "epoch": 1 }, { "type": "loss", "content": 0.014327945187687874, "timestamp": "2025-09-04 03:53:21.710185", "step": 925, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:53:21.806681", "step": 925, "epoch": 1 }, { "type": "loss", "content": 0.04167867451906204, "timestamp": "2025-09-04 03:53:21.823957", "step": 926, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:21.914206", "step": 926, "epoch": 1 }, { "type": "loss", "content": 0.004316235426813364, "timestamp": "2025-09-04 03:53:21.930710", "step": 927, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:53:22.006415", "step": 927, "epoch": 1 }, { "type": "loss", "content": 0.04848542809486389, "timestamp": "2025-09-04 03:53:22.020751", "step": 928, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:53:22.117538", "step": 928, "epoch": 1 }, { "type": "loss", "content": 0.008897616527974606, "timestamp": "2025-09-04 03:53:22.137654", "step": 929, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:53:22.246947", "step": 929, "epoch": 1 }, { "type": "loss", "content": 0.006201483774930239, "timestamp": "2025-09-04 03:53:22.267302", "step": 930, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:22.361441", "step": 930, "epoch": 1 }, { "type": "loss", "content": 0.06400009244680405, "timestamp": "2025-09-04 03:53:22.378595", "step": 931, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:53:22.502905", "step": 931, "epoch": 1 }, { "type": "loss", "content": 0.009115933440625668, "timestamp": "2025-09-04 03:53:22.526648", "step": 932, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:22.626849", "step": 932, "epoch": 1 }, { "type": "loss", "content": 0.01854240521788597, "timestamp": "2025-09-04 03:53:22.647644", "step": 933, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 03:53:22.821929", "step": 933, "epoch": 1 }, { "type": "loss", "content": 0.015260078944265842, "timestamp": "2025-09-04 03:53:22.854322", "step": 934, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:22.961777", "step": 934, "epoch": 1 }, { "type": "loss", "content": 0.009783388115465641, "timestamp": "2025-09-04 03:53:22.981491", "step": 935, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:23.090151", "step": 935, "epoch": 1 }, { "type": "loss", "content": 0.01038886234164238, "timestamp": "2025-09-04 03:53:23.110577", "step": 936, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:23.199339", "step": 936, "epoch": 1 }, { "type": "loss", "content": 0.04066663980484009, "timestamp": "2025-09-04 03:53:23.217678", "step": 937, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:23.324197", "step": 937, "epoch": 1 }, { "type": "loss", "content": 0.015022533014416695, "timestamp": "2025-09-04 03:53:23.343914", "step": 938, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:53:23.429503", "step": 938, "epoch": 1 }, { "type": "loss", "content": 0.019334964454174042, "timestamp": "2025-09-04 03:53:23.444805", "step": 939, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:23.551488", "step": 939, "epoch": 1 }, { "type": "loss", "content": 0.027674207463860512, "timestamp": "2025-09-04 03:53:23.572051", "step": 940, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:53:31.943440", "step": 940, "epoch": 1 }, { "type": "pplx", "content": 324.7636312669318, "timestamp": "2025-09-04 03:53:31.945581", "step": 940, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:32.047643", "step": 940, "epoch": 1 }, { "type": "loss", "content": 0.004062741529196501, "timestamp": "2025-09-04 03:53:32.069548", "step": 941, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:53:32.273285", "step": 941, "epoch": 1 }, { "type": "loss", "content": 0.11786609143018723, "timestamp": "2025-09-04 03:53:32.312283", "step": 942, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:53:32.390789", "step": 942, "epoch": 1 }, { "type": "loss", "content": 0.012708038091659546, "timestamp": "2025-09-04 03:53:32.404914", "step": 943, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:53:32.523185", "step": 943, "epoch": 1 }, { "type": "loss", "content": 0.05294226482510567, "timestamp": "2025-09-04 03:53:32.546081", "step": 944, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:53:32.652441", "step": 944, "epoch": 1 }, { "type": "loss", "content": 0.06667114049196243, "timestamp": "2025-09-04 03:53:32.674610", "step": 945, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:32.765158", "step": 945, "epoch": 1 }, { "type": "loss", "content": 0.017390718683600426, "timestamp": "2025-09-04 03:53:32.781730", "step": 946, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:53:32.866576", "step": 946, "epoch": 1 }, { "type": "loss", "content": 0.003352563362568617, "timestamp": "2025-09-04 03:53:32.881804", "step": 947, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:32.984148", "step": 947, "epoch": 1 }, { "type": "loss", "content": 0.059882860630750656, "timestamp": "2025-09-04 03:53:33.003916", "step": 948, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:53:33.096962", "step": 948, "epoch": 1 }, { "type": "loss", "content": 0.04028277471661568, "timestamp": "2025-09-04 03:53:33.115970", "step": 949, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 03:53:33.290357", "step": 949, "epoch": 1 }, { "type": "loss", "content": 0.018983660265803337, "timestamp": "2025-09-04 03:53:33.324730", "step": 950, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:33.418146", "step": 950, "epoch": 1 }, { "type": "loss", "content": 0.01843661069869995, "timestamp": "2025-09-04 03:53:33.435067", "step": 951, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:53:33.518359", "step": 951, "epoch": 1 }, { "type": "loss", "content": 0.029021859169006348, "timestamp": "2025-09-04 03:53:33.534111", "step": 952, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:53:33.617540", "step": 952, "epoch": 1 }, { "type": "loss", "content": 0.01326957531273365, "timestamp": "2025-09-04 03:53:33.634605", "step": 953, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:53:33.713033", "step": 953, "epoch": 1 }, { "type": "loss", "content": 0.0053786844946444035, "timestamp": "2025-09-04 03:53:33.726795", "step": 954, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:33.829268", "step": 954, "epoch": 1 }, { "type": "loss", "content": 0.009815702214837074, "timestamp": "2025-09-04 03:53:33.848427", "step": 955, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:53:33.943180", "step": 955, "epoch": 1 }, { "type": "loss", "content": 0.012396165169775486, "timestamp": "2025-09-04 03:53:33.961066", "step": 956, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:53:34.067090", "step": 956, "epoch": 1 }, { "type": "loss", "content": 0.01327445451170206, "timestamp": "2025-09-04 03:53:34.089193", "step": 957, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:53:34.292580", "step": 957, "epoch": 1 }, { "type": "loss", "content": 0.0052812471985816956, "timestamp": "2025-09-04 03:53:34.331717", "step": 958, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:34.436182", "step": 958, "epoch": 1 }, { "type": "loss", "content": 0.037195418030023575, "timestamp": "2025-09-04 03:53:34.455354", "step": 959, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:53:34.554713", "step": 959, "epoch": 1 }, { "type": "loss", "content": 0.0028625449631363153, "timestamp": "2025-09-04 03:53:34.574149", "step": 960, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:53:42.954180", "step": 960, "epoch": 1 }, { "type": "pplx", "content": 327.4406282794405, "timestamp": "2025-09-04 03:53:42.956295", "step": 960, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 960", "timestamp": "2025-09-04 03:53:43.310072", "step": 960, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:53:43.426090", "step": 960, "epoch": 1 }, { "type": "loss", "content": 0.11174288392066956, "timestamp": "2025-09-04 03:53:43.449893", "step": 961, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:43.552254", "step": 961, "epoch": 1 }, { "type": "loss", "content": 0.06079157814383507, "timestamp": "2025-09-04 03:53:43.571555", "step": 962, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:43.674623", "step": 962, "epoch": 1 }, { "type": "loss", "content": 0.007762270979583263, "timestamp": "2025-09-04 03:53:43.693819", "step": 963, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:53:43.788814", "step": 963, "epoch": 1 }, { "type": "loss", "content": 0.1044481098651886, "timestamp": "2025-09-04 03:53:43.807251", "step": 964, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:53:43.898657", "step": 964, "epoch": 1 }, { "type": "loss", "content": 0.04733144864439964, "timestamp": "2025-09-04 03:53:43.917440", "step": 965, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:44.018358", "step": 965, "epoch": 1 }, { "type": "loss", "content": 0.04005102813243866, "timestamp": "2025-09-04 03:53:44.037363", "step": 966, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:44.139306", "step": 966, "epoch": 1 }, { "type": "loss", "content": 0.015302048996090889, "timestamp": "2025-09-04 03:53:44.158514", "step": 967, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:44.262372", "step": 967, "epoch": 1 }, { "type": "loss", "content": 0.04137500002980232, "timestamp": "2025-09-04 03:53:44.282149", "step": 968, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:53:44.355720", "step": 968, "epoch": 1 }, { "type": "loss", "content": 0.012012657709419727, "timestamp": "2025-09-04 03:53:44.370202", "step": 969, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:44.473736", "step": 969, "epoch": 1 }, { "type": "loss", "content": 0.0035866987891495228, "timestamp": "2025-09-04 03:53:44.492727", "step": 970, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:44.593769", "step": 970, "epoch": 1 }, { "type": "loss", "content": 0.011977934278547764, "timestamp": "2025-09-04 03:53:44.612318", "step": 971, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:53:44.712118", "step": 971, "epoch": 1 }, { "type": "loss", "content": 0.010067245922982693, "timestamp": "2025-09-04 03:53:44.731252", "step": 972, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:44.819828", "step": 972, "epoch": 1 }, { "type": "loss", "content": 0.025547686964273453, "timestamp": "2025-09-04 03:53:44.837978", "step": 973, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:53:44.915441", "step": 973, "epoch": 1 }, { "type": "loss", "content": 0.037151534110307693, "timestamp": "2025-09-04 03:53:44.929215", "step": 974, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 03:53:45.143074", "step": 974, "epoch": 1 }, { "type": "loss", "content": 0.027699746191501617, "timestamp": "2025-09-04 03:53:45.184042", "step": 975, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:53:45.279228", "step": 975, "epoch": 1 }, { "type": "loss", "content": 0.05770542845129967, "timestamp": "2025-09-04 03:53:45.297565", "step": 976, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:53:45.373416", "step": 976, "epoch": 1 }, { "type": "loss", "content": 0.010280147194862366, "timestamp": "2025-09-04 03:53:45.388732", "step": 977, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:45.491256", "step": 977, "epoch": 1 }, { "type": "loss", "content": 0.016326548531651497, "timestamp": "2025-09-04 03:53:45.510479", "step": 978, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 03:53:45.637463", "step": 978, "epoch": 1 }, { "type": "loss", "content": 0.0350724533200264, "timestamp": "2025-09-04 03:53:45.661885", "step": 979, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:45.765390", "step": 979, "epoch": 1 }, { "type": "loss", "content": 0.012970902025699615, "timestamp": "2025-09-04 03:53:45.785448", "step": 980, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:53:54.156759", "step": 980, "epoch": 1 }, { "type": "pplx", "content": 326.69142554401174, "timestamp": "2025-09-04 03:53:54.158794", "step": 980, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:53:54.233062", "step": 980, "epoch": 1 }, { "type": "loss", "content": 0.008149470202624798, "timestamp": "2025-09-04 03:53:54.248341", "step": 981, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:53:54.351582", "step": 981, "epoch": 1 }, { "type": "loss", "content": 0.04477942734956741, "timestamp": "2025-09-04 03:53:54.370852", "step": 982, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:54.471887", "step": 982, "epoch": 1 }, { "type": "loss", "content": 0.04230527952313423, "timestamp": "2025-09-04 03:53:54.490758", "step": 983, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:53:54.569763", "step": 983, "epoch": 1 }, { "type": "loss", "content": 0.021094702184200287, "timestamp": "2025-09-04 03:53:54.584677", "step": 984, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:54.681680", "step": 984, "epoch": 1 }, { "type": "loss", "content": 0.050900258123874664, "timestamp": "2025-09-04 03:53:54.702364", "step": 985, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:53:54.810576", "step": 985, "epoch": 1 }, { "type": "loss", "content": 0.014919820241630077, "timestamp": "2025-09-04 03:53:54.830622", "step": 986, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:54.921462", "step": 986, "epoch": 1 }, { "type": "loss", "content": 0.0671142116189003, "timestamp": "2025-09-04 03:53:54.938186", "step": 987, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:53:55.024616", "step": 987, "epoch": 1 }, { "type": "loss", "content": 0.04901759326457977, "timestamp": "2025-09-04 03:53:55.040769", "step": 988, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:53:55.129148", "step": 988, "epoch": 1 }, { "type": "loss", "content": 0.02041156031191349, "timestamp": "2025-09-04 03:53:55.147314", "step": 989, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:53:55.222313", "step": 989, "epoch": 1 }, { "type": "loss", "content": 0.03128139674663544, "timestamp": "2025-09-04 03:53:55.236053", "step": 990, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:55.328730", "step": 990, "epoch": 1 }, { "type": "loss", "content": 0.1105596199631691, "timestamp": "2025-09-04 03:53:55.345640", "step": 991, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:55.446529", "step": 991, "epoch": 1 }, { "type": "loss", "content": 0.2516331374645233, "timestamp": "2025-09-04 03:53:55.465931", "step": 992, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:53:55.565425", "step": 992, "epoch": 1 }, { "type": "loss", "content": 0.006421744357794523, "timestamp": "2025-09-04 03:53:55.586207", "step": 993, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:55.678970", "step": 993, "epoch": 1 }, { "type": "loss", "content": 0.042763665318489075, "timestamp": "2025-09-04 03:53:55.695888", "step": 994, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:55.797442", "step": 994, "epoch": 1 }, { "type": "loss", "content": 0.004555892664939165, "timestamp": "2025-09-04 03:53:55.816067", "step": 995, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:53:55.922577", "step": 995, "epoch": 1 }, { "type": "loss", "content": 0.04880642145872116, "timestamp": "2025-09-04 03:53:55.943075", "step": 996, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:56.040785", "step": 996, "epoch": 1 }, { "type": "loss", "content": 0.029496213421225548, "timestamp": "2025-09-04 03:53:56.061263", "step": 997, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:53:56.154620", "step": 997, "epoch": 1 }, { "type": "loss", "content": 0.01086695957928896, "timestamp": "2025-09-04 03:53:56.171470", "step": 998, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:53:56.272119", "step": 998, "epoch": 1 }, { "type": "loss", "content": 0.03509443625807762, "timestamp": "2025-09-04 03:53:56.290774", "step": 999, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:53:56.400883", "step": 999, "epoch": 1 }, { "type": "loss", "content": 0.05549605190753937, "timestamp": "2025-09-04 03:53:56.422038", "step": 1000, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:54:04.789966", "step": 1000, "epoch": 1 }, { "type": "pplx", "content": 323.93892790465287, "timestamp": "2025-09-04 03:54:04.792236", "step": 1000, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-04 03:54:05.144251", "step": 1000, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:54:05.217721", "step": 1000, "epoch": 1 }, { "type": "loss", "content": 0.03098498098552227, "timestamp": "2025-09-04 03:54:05.232637", "step": 1001, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:05.333112", "step": 1001, "epoch": 1 }, { "type": "loss", "content": 0.018459502607584, "timestamp": "2025-09-04 03:54:05.352002", "step": 1002, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:54:05.436559", "step": 1002, "epoch": 1 }, { "type": "loss", "content": 0.045362938195466995, "timestamp": "2025-09-04 03:54:05.451997", "step": 1003, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:54:05.546333", "step": 1003, "epoch": 1 }, { "type": "loss", "content": 0.013697554357349873, "timestamp": "2025-09-04 03:54:05.564241", "step": 1004, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:05.656441", "step": 1004, "epoch": 1 }, { "type": "loss", "content": 0.014663312584161758, "timestamp": "2025-09-04 03:54:05.675346", "step": 1005, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:05.767072", "step": 1005, "epoch": 1 }, { "type": "loss", "content": 0.033350620418787, "timestamp": "2025-09-04 03:54:05.783605", "step": 1006, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:05.887104", "step": 1006, "epoch": 1 }, { "type": "loss", "content": 0.0400872677564621, "timestamp": "2025-09-04 03:54:05.906229", "step": 1007, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:54:06.015408", "step": 1007, "epoch": 1 }, { "type": "loss", "content": 0.03101273812353611, "timestamp": "2025-09-04 03:54:06.036432", "step": 1008, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:54:06.129001", "step": 1008, "epoch": 1 }, { "type": "loss", "content": 0.01906219683587551, "timestamp": "2025-09-04 03:54:06.147639", "step": 1009, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:06.250583", "step": 1009, "epoch": 1 }, { "type": "loss", "content": 0.015632882714271545, "timestamp": "2025-09-04 03:54:06.269673", "step": 1010, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:06.373501", "step": 1010, "epoch": 1 }, { "type": "loss", "content": 0.015542350709438324, "timestamp": "2025-09-04 03:54:06.392552", "step": 1011, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:06.486372", "step": 1011, "epoch": 1 }, { "type": "loss", "content": 0.010193181224167347, "timestamp": "2025-09-04 03:54:06.504449", "step": 1012, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:06.604714", "step": 1012, "epoch": 1 }, { "type": "loss", "content": 0.009116173721849918, "timestamp": "2025-09-04 03:54:06.625559", "step": 1013, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:54:06.734943", "step": 1013, "epoch": 1 }, { "type": "loss", "content": 0.011737700551748276, "timestamp": "2025-09-04 03:54:06.755347", "step": 1014, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:06.858522", "step": 1014, "epoch": 1 }, { "type": "loss", "content": 0.05300810933113098, "timestamp": "2025-09-04 03:54:06.877568", "step": 1015, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:54:06.964396", "step": 1015, "epoch": 1 }, { "type": "loss", "content": 0.016496986150741577, "timestamp": "2025-09-04 03:54:06.980756", "step": 1016, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:54:07.063396", "step": 1016, "epoch": 1 }, { "type": "loss", "content": 0.06345777213573456, "timestamp": "2025-09-04 03:54:07.080148", "step": 1017, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:07.171364", "step": 1017, "epoch": 1 }, { "type": "loss", "content": 0.01955530233681202, "timestamp": "2025-09-04 03:54:07.188018", "step": 1018, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:07.288317", "step": 1018, "epoch": 1 }, { "type": "loss", "content": 0.009021622128784657, "timestamp": "2025-09-04 03:54:07.306933", "step": 1019, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:07.407378", "step": 1019, "epoch": 1 }, { "type": "loss", "content": 0.02454635314643383, "timestamp": "2025-09-04 03:54:07.426759", "step": 1020, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:54:15.807366", "step": 1020, "epoch": 1 }, { "type": "pplx", "content": 322.27552972591826, "timestamp": "2025-09-04 03:54:15.809410", "step": 1020, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:54:15.888783", "step": 1020, "epoch": 1 }, { "type": "loss", "content": 0.04511036351323128, "timestamp": "2025-09-04 03:54:15.905147", "step": 1021, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:54:16.013227", "step": 1021, "epoch": 1 }, { "type": "loss", "content": 0.12731988728046417, "timestamp": "2025-09-04 03:54:16.033415", "step": 1022, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 03:54:16.169915", "step": 1022, "epoch": 1 }, { "type": "loss", "content": 0.05131891369819641, "timestamp": "2025-09-04 03:54:16.195930", "step": 1023, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:16.297930", "step": 1023, "epoch": 1 }, { "type": "loss", "content": 0.020861327648162842, "timestamp": "2025-09-04 03:54:16.317755", "step": 1024, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:54:16.420449", "step": 1024, "epoch": 1 }, { "type": "loss", "content": 0.05731099098920822, "timestamp": "2025-09-04 03:54:16.442256", "step": 1025, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:16.544840", "step": 1025, "epoch": 1 }, { "type": "loss", "content": 0.04736688733100891, "timestamp": "2025-09-04 03:54:16.563934", "step": 1026, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:16.662732", "step": 1026, "epoch": 1 }, { "type": "loss", "content": 0.024726245552301407, "timestamp": "2025-09-04 03:54:16.681418", "step": 1027, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:54:16.764441", "step": 1027, "epoch": 1 }, { "type": "loss", "content": 0.07053575664758682, "timestamp": "2025-09-04 03:54:16.780300", "step": 1028, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:54:16.886324", "step": 1028, "epoch": 1 }, { "type": "loss", "content": 0.003608345054090023, "timestamp": "2025-09-04 03:54:16.908667", "step": 1029, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:54:17.003170", "step": 1029, "epoch": 1 }, { "type": "loss", "content": 0.01059285830706358, "timestamp": "2025-09-04 03:54:17.020000", "step": 1030, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:17.120442", "step": 1030, "epoch": 1 }, { "type": "loss", "content": 0.026861051097512245, "timestamp": "2025-09-04 03:54:17.139094", "step": 1031, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 03:54:17.313090", "step": 1031, "epoch": 1 }, { "type": "loss", "content": 0.012563884258270264, "timestamp": "2025-09-04 03:54:17.346335", "step": 1032, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:54:17.453087", "step": 1032, "epoch": 1 }, { "type": "loss", "content": 0.008008824661374092, "timestamp": "2025-09-04 03:54:17.475593", "step": 1033, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:17.578569", "step": 1033, "epoch": 1 }, { "type": "loss", "content": 0.004494689870625734, "timestamp": "2025-09-04 03:54:17.597833", "step": 1034, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:54:17.676279", "step": 1034, "epoch": 1 }, { "type": "loss", "content": 0.05889247730374336, "timestamp": "2025-09-04 03:54:17.690066", "step": 1035, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:17.794363", "step": 1035, "epoch": 1 }, { "type": "loss", "content": 0.03641991689801216, "timestamp": "2025-09-04 03:54:17.814291", "step": 1036, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:17.915191", "step": 1036, "epoch": 1 }, { "type": "loss", "content": 0.015427891165018082, "timestamp": "2025-09-04 03:54:17.936059", "step": 1037, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:54:18.047638", "step": 1037, "epoch": 1 }, { "type": "loss", "content": 0.04161277413368225, "timestamp": "2025-09-04 03:54:18.068089", "step": 1038, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:18.169531", "step": 1038, "epoch": 1 }, { "type": "loss", "content": 0.013311103917658329, "timestamp": "2025-09-04 03:54:18.188142", "step": 1039, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:18.289767", "step": 1039, "epoch": 1 }, { "type": "loss", "content": 0.056311286985874176, "timestamp": "2025-09-04 03:54:18.309120", "step": 1040, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:54:26.699707", "step": 1040, "epoch": 1 }, { "type": "pplx", "content": 325.4783086719936, "timestamp": "2025-09-04 03:54:26.702081", "step": 1040, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1040", "timestamp": "2025-09-04 03:54:27.217343", "step": 1040, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:54:27.293442", "step": 1040, "epoch": 1 }, { "type": "loss", "content": 0.005170788615942001, "timestamp": "2025-09-04 03:54:27.308800", "step": 1041, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:27.411906", "step": 1041, "epoch": 1 }, { "type": "loss", "content": 0.0431942418217659, "timestamp": "2025-09-04 03:54:27.430779", "step": 1042, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 03:54:27.562183", "step": 1042, "epoch": 1 }, { "type": "loss", "content": 0.003155430080369115, "timestamp": "2025-09-04 03:54:27.585655", "step": 1043, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:27.676253", "step": 1043, "epoch": 1 }, { "type": "loss", "content": 0.031002743169665337, "timestamp": "2025-09-04 03:54:27.693852", "step": 1044, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1008 ], "flops": 20160122450880.0 }, "timestamp": "2025-09-04 03:54:27.835708", "step": 1044, "epoch": 1 }, { "type": "loss", "content": 0.0061140297912061214, "timestamp": "2025-09-04 03:54:27.866848", "step": 1045, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:27.970250", "step": 1045, "epoch": 1 }, { "type": "loss", "content": 0.024821752682328224, "timestamp": "2025-09-04 03:54:27.989454", "step": 1046, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:54:28.097849", "step": 1046, "epoch": 1 }, { "type": "loss", "content": 0.02534925378859043, "timestamp": "2025-09-04 03:54:28.118164", "step": 1047, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:28.217521", "step": 1047, "epoch": 1 }, { "type": "loss", "content": 0.06829417496919632, "timestamp": "2025-09-04 03:54:28.236845", "step": 1048, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:28.337902", "step": 1048, "epoch": 1 }, { "type": "loss", "content": 0.011538490653038025, "timestamp": "2025-09-04 03:54:28.359059", "step": 1049, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:28.460838", "step": 1049, "epoch": 1 }, { "type": "loss", "content": 0.017921442165970802, "timestamp": "2025-09-04 03:54:28.479791", "step": 1050, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:54:28.557920", "step": 1050, "epoch": 1 }, { "type": "loss", "content": 0.00594189902767539, "timestamp": "2025-09-04 03:54:28.572056", "step": 1051, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:28.661977", "step": 1051, "epoch": 1 }, { "type": "loss", "content": 0.06706573069095612, "timestamp": "2025-09-04 03:54:28.679555", "step": 1052, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:54:28.793603", "step": 1052, "epoch": 1 }, { "type": "loss", "content": 0.07539967447519302, "timestamp": "2025-09-04 03:54:28.817899", "step": 1053, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:28.911701", "step": 1053, "epoch": 1 }, { "type": "loss", "content": 0.01035250723361969, "timestamp": "2025-09-04 03:54:28.929113", "step": 1054, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:54:29.003669", "step": 1054, "epoch": 1 }, { "type": "loss", "content": 0.024351386353373528, "timestamp": "2025-09-04 03:54:29.017233", "step": 1055, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:29.122010", "step": 1055, "epoch": 1 }, { "type": "loss", "content": 0.041539065539836884, "timestamp": "2025-09-04 03:54:29.141863", "step": 1056, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:29.238138", "step": 1056, "epoch": 1 }, { "type": "loss", "content": 0.03460566699504852, "timestamp": "2025-09-04 03:54:29.258519", "step": 1057, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:54:29.393251", "step": 1057, "epoch": 1 }, { "type": "loss", "content": 0.005672653205692768, "timestamp": "2025-09-04 03:54:29.418907", "step": 1058, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:29.520526", "step": 1058, "epoch": 1 }, { "type": "loss", "content": 0.04056652635335922, "timestamp": "2025-09-04 03:54:29.539428", "step": 1059, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:54:29.625500", "step": 1059, "epoch": 1 }, { "type": "loss", "content": 0.018356602638959885, "timestamp": "2025-09-04 03:54:29.641923", "step": 1060, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:54:38.037867", "step": 1060, "epoch": 1 }, { "type": "pplx", "content": 329.5744553045225, "timestamp": "2025-09-04 03:54:38.039700", "step": 1060, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:38.137979", "step": 1060, "epoch": 1 }, { "type": "loss", "content": 0.018076708540320396, "timestamp": "2025-09-04 03:54:38.159151", "step": 1061, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:54:38.238436", "step": 1061, "epoch": 1 }, { "type": "loss", "content": 0.008117503486573696, "timestamp": "2025-09-04 03:54:38.252549", "step": 1062, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:54:38.330718", "step": 1062, "epoch": 1 }, { "type": "loss", "content": 0.09541762620210648, "timestamp": "2025-09-04 03:54:38.344655", "step": 1063, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:54:38.455471", "step": 1063, "epoch": 1 }, { "type": "loss", "content": 0.004544029477983713, "timestamp": "2025-09-04 03:54:38.476657", "step": 1064, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:54:38.553048", "step": 1064, "epoch": 1 }, { "type": "loss", "content": 0.018085738644003868, "timestamp": "2025-09-04 03:54:38.568555", "step": 1065, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:38.672167", "step": 1065, "epoch": 1 }, { "type": "loss", "content": 0.008873535320162773, "timestamp": "2025-09-04 03:54:38.691241", "step": 1066, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:54:38.798973", "step": 1066, "epoch": 1 }, { "type": "loss", "content": 0.016320547088980675, "timestamp": "2025-09-04 03:54:38.818751", "step": 1067, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:38.923333", "step": 1067, "epoch": 1 }, { "type": "loss", "content": 0.04694967344403267, "timestamp": "2025-09-04 03:54:38.943139", "step": 1068, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:54:39.051635", "step": 1068, "epoch": 1 }, { "type": "loss", "content": 0.05457077920436859, "timestamp": "2025-09-04 03:54:39.073300", "step": 1069, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:39.169792", "step": 1069, "epoch": 1 }, { "type": "loss", "content": 0.006687942426651716, "timestamp": "2025-09-04 03:54:39.186448", "step": 1070, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:39.281245", "step": 1070, "epoch": 1 }, { "type": "loss", "content": 0.019025299698114395, "timestamp": "2025-09-04 03:54:39.298353", "step": 1071, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:39.390091", "step": 1071, "epoch": 1 }, { "type": "loss", "content": 0.020227275788784027, "timestamp": "2025-09-04 03:54:39.407340", "step": 1072, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:54:39.500568", "step": 1072, "epoch": 1 }, { "type": "loss", "content": 0.05452266335487366, "timestamp": "2025-09-04 03:54:39.519561", "step": 1073, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:54:39.604031", "step": 1073, "epoch": 1 }, { "type": "loss", "content": 0.027135973796248436, "timestamp": "2025-09-04 03:54:39.618960", "step": 1074, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:39.718358", "step": 1074, "epoch": 1 }, { "type": "loss", "content": 0.027994517236948013, "timestamp": "2025-09-04 03:54:39.736747", "step": 1075, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:54:39.843238", "step": 1075, "epoch": 1 }, { "type": "loss", "content": 0.023070676252245903, "timestamp": "2025-09-04 03:54:39.863118", "step": 1076, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:39.963035", "step": 1076, "epoch": 1 }, { "type": "loss", "content": 0.01172536239027977, "timestamp": "2025-09-04 03:54:39.983911", "step": 1077, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:54:40.070577", "step": 1077, "epoch": 1 }, { "type": "loss", "content": 0.029984669759869576, "timestamp": "2025-09-04 03:54:40.086018", "step": 1078, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:40.179735", "step": 1078, "epoch": 1 }, { "type": "loss", "content": 0.024687113240361214, "timestamp": "2025-09-04 03:54:40.196981", "step": 1079, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:40.300861", "step": 1079, "epoch": 1 }, { "type": "loss", "content": 0.008470497094094753, "timestamp": "2025-09-04 03:54:40.320739", "step": 1080, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:54:48.711208", "step": 1080, "epoch": 1 }, { "type": "pplx", "content": 331.8358204811036, "timestamp": "2025-09-04 03:54:48.713485", "step": 1080, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1080", "timestamp": "2025-09-04 03:54:49.226299", "step": 1080, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:49.323088", "step": 1080, "epoch": 1 }, { "type": "loss", "content": 0.08154661953449249, "timestamp": "2025-09-04 03:54:49.343622", "step": 1081, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:54:49.448147", "step": 1081, "epoch": 1 }, { "type": "loss", "content": 0.028146987780928612, "timestamp": "2025-09-04 03:54:49.468034", "step": 1082, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:49.571712", "step": 1082, "epoch": 1 }, { "type": "loss", "content": 0.024996791034936905, "timestamp": "2025-09-04 03:54:49.590885", "step": 1083, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1424 ], "flops": 28480172958272.0 }, "timestamp": "2025-09-04 03:54:49.802309", "step": 1083, "epoch": 1 }, { "type": "loss", "content": 0.03116009198129177, "timestamp": "2025-09-04 03:54:49.843613", "step": 1084, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:49.941437", "step": 1084, "epoch": 1 }, { "type": "loss", "content": 0.032337624579668045, "timestamp": "2025-09-04 03:54:49.961390", "step": 1085, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:54:50.058815", "step": 1085, "epoch": 1 }, { "type": "loss", "content": 0.07175617665052414, "timestamp": "2025-09-04 03:54:50.075368", "step": 1086, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:50.169947", "step": 1086, "epoch": 1 }, { "type": "loss", "content": 0.0184723399579525, "timestamp": "2025-09-04 03:54:50.186963", "step": 1087, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:50.291628", "step": 1087, "epoch": 1 }, { "type": "loss", "content": 0.03585413470864296, "timestamp": "2025-09-04 03:54:50.311636", "step": 1088, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:54:50.392897", "step": 1088, "epoch": 1 }, { "type": "loss", "content": 0.03618989884853363, "timestamp": "2025-09-04 03:54:50.409370", "step": 1089, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:54:50.518834", "step": 1089, "epoch": 1 }, { "type": "loss", "content": 0.05040483921766281, "timestamp": "2025-09-04 03:54:50.538855", "step": 1090, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:54:50.639859", "step": 1090, "epoch": 1 }, { "type": "loss", "content": 0.05431760847568512, "timestamp": "2025-09-04 03:54:50.658560", "step": 1091, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:54:50.753968", "step": 1091, "epoch": 1 }, { "type": "loss", "content": 0.10624273121356964, "timestamp": "2025-09-04 03:54:50.772035", "step": 1092, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:54:50.844993", "step": 1092, "epoch": 1 }, { "type": "loss", "content": 0.03237966448068619, "timestamp": "2025-09-04 03:54:50.859614", "step": 1093, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:54:50.958245", "step": 1093, "epoch": 1 }, { "type": "loss", "content": 0.04587221145629883, "timestamp": "2025-09-04 03:54:50.976639", "step": 1094, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:54:51.094402", "step": 1094, "epoch": 1 }, { "type": "loss", "content": 0.028815526515245438, "timestamp": "2025-09-04 03:54:51.116563", "step": 1095, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:54:51.220135", "step": 1095, "epoch": 1 }, { "type": "loss", "content": 0.013644875027239323, "timestamp": "2025-09-04 03:54:51.239889", "step": 1096, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 03:54:51.384380", "step": 1096, "epoch": 1 }, { "type": "loss", "content": 0.03480622544884682, "timestamp": "2025-09-04 03:54:51.415384", "step": 1097, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:54:51.523257", "step": 1097, "epoch": 1 }, { "type": "loss", "content": 0.03265468776226044, "timestamp": "2025-09-04 03:54:51.543382", "step": 1098, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:54:51.638835", "step": 1098, "epoch": 1 }, { "type": "loss", "content": 0.003582942998036742, "timestamp": "2025-09-04 03:54:51.656106", "step": 1099, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:54:51.749819", "step": 1099, "epoch": 1 }, { "type": "loss", "content": 0.058865927159786224, "timestamp": "2025-09-04 03:54:51.767754", "step": 1100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:00.169117", "step": 1100, "epoch": 1 }, { "type": "pplx", "content": 330.568579902323, "timestamp": "2025-09-04 03:55:00.171543", "step": 1100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:00.268289", "step": 1100, "epoch": 1 }, { "type": "loss", "content": 0.06865076720714569, "timestamp": "2025-09-04 03:55:00.288903", "step": 1101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:00.390269", "step": 1101, "epoch": 1 }, { "type": "loss", "content": 0.10805238038301468, "timestamp": "2025-09-04 03:55:00.408882", "step": 1102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:55:00.484730", "step": 1102, "epoch": 1 }, { "type": "loss", "content": 0.01404650043696165, "timestamp": "2025-09-04 03:55:00.498048", "step": 1103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:55:00.590116", "step": 1103, "epoch": 1 }, { "type": "loss", "content": 0.06355747580528259, "timestamp": "2025-09-04 03:55:00.607459", "step": 1104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:55:00.714230", "step": 1104, "epoch": 1 }, { "type": "loss", "content": 0.02257397770881653, "timestamp": "2025-09-04 03:55:00.736553", "step": 1105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:00.835157", "step": 1105, "epoch": 1 }, { "type": "loss", "content": 0.03384973481297493, "timestamp": "2025-09-04 03:55:00.852475", "step": 1106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:00.956418", "step": 1106, "epoch": 1 }, { "type": "loss", "content": 0.001299434108659625, "timestamp": "2025-09-04 03:55:00.975594", "step": 1107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:55:01.051270", "step": 1107, "epoch": 1 }, { "type": "loss", "content": 0.02580624632537365, "timestamp": "2025-09-04 03:55:01.065411", "step": 1108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:55:01.157686", "step": 1108, "epoch": 1 }, { "type": "loss", "content": 0.04326159134507179, "timestamp": "2025-09-04 03:55:01.176243", "step": 1109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:01.277634", "step": 1109, "epoch": 1 }, { "type": "loss", "content": 0.05120621249079704, "timestamp": "2025-09-04 03:55:01.296313", "step": 1110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:01.399016", "step": 1110, "epoch": 1 }, { "type": "loss", "content": 0.0068900627084076405, "timestamp": "2025-09-04 03:55:01.417975", "step": 1111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:01.528856", "step": 1111, "epoch": 1 }, { "type": "loss", "content": 0.018264418467879295, "timestamp": "2025-09-04 03:55:01.550027", "step": 1112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:55:01.630695", "step": 1112, "epoch": 1 }, { "type": "loss", "content": 0.022158373147249222, "timestamp": "2025-09-04 03:55:01.647141", "step": 1113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 03:55:01.818413", "step": 1113, "epoch": 1 }, { "type": "loss", "content": 0.027837276458740234, "timestamp": "2025-09-04 03:55:01.853035", "step": 1114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:01.955510", "step": 1114, "epoch": 1 }, { "type": "loss", "content": 0.044819965958595276, "timestamp": "2025-09-04 03:55:01.974485", "step": 1115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:02.076943", "step": 1115, "epoch": 1 }, { "type": "loss", "content": 0.040360935032367706, "timestamp": "2025-09-04 03:55:02.096612", "step": 1116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:02.197548", "step": 1116, "epoch": 1 }, { "type": "loss", "content": 0.030223630368709564, "timestamp": "2025-09-04 03:55:02.218582", "step": 1117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:55:02.293568", "step": 1117, "epoch": 1 }, { "type": "loss", "content": 0.04847509413957596, "timestamp": "2025-09-04 03:55:02.306906", "step": 1118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:02.401949", "step": 1118, "epoch": 1 }, { "type": "loss", "content": 0.0198849868029356, "timestamp": "2025-09-04 03:55:02.419194", "step": 1119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:02.504757", "step": 1119, "epoch": 1 }, { "type": "loss", "content": 0.0160811897367239, "timestamp": "2025-09-04 03:55:02.520772", "step": 1120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:10.996486", "step": 1120, "epoch": 1 }, { "type": "pplx", "content": 333.6826089296669, "timestamp": "2025-09-04 03:55:10.998830", "step": 1120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1120", "timestamp": "2025-09-04 03:55:11.503767", "step": 1120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:55:11.577559", "step": 1120, "epoch": 1 }, { "type": "loss", "content": 0.047653671354055405, "timestamp": "2025-09-04 03:55:11.592267", "step": 1121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:55:11.688189", "step": 1121, "epoch": 1 }, { "type": "loss", "content": 0.02942013181746006, "timestamp": "2025-09-04 03:55:11.705321", "step": 1122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:11.811599", "step": 1122, "epoch": 1 }, { "type": "loss", "content": 0.04028499498963356, "timestamp": "2025-09-04 03:55:11.830641", "step": 1123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:11.919412", "step": 1123, "epoch": 1 }, { "type": "loss", "content": 0.029223607853055, "timestamp": "2025-09-04 03:55:11.935618", "step": 1124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1232 ], "flops": 24640149647168.0 }, "timestamp": "2025-09-04 03:55:12.117352", "step": 1124, "epoch": 1 }, { "type": "loss", "content": 0.100711390376091, "timestamp": "2025-09-04 03:55:12.154447", "step": 1125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:12.241477", "step": 1125, "epoch": 1 }, { "type": "loss", "content": 0.009318535216152668, "timestamp": "2025-09-04 03:55:12.256752", "step": 1126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:12.361415", "step": 1126, "epoch": 1 }, { "type": "loss", "content": 0.049649372696876526, "timestamp": "2025-09-04 03:55:12.380493", "step": 1127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:12.491512", "step": 1127, "epoch": 1 }, { "type": "loss", "content": 0.015808911994099617, "timestamp": "2025-09-04 03:55:12.510709", "step": 1128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 03:55:12.646841", "step": 1128, "epoch": 1 }, { "type": "loss", "content": 0.010250390507280827, "timestamp": "2025-09-04 03:55:12.675085", "step": 1129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:12.763963", "step": 1129, "epoch": 1 }, { "type": "loss", "content": 0.0313245952129364, "timestamp": "2025-09-04 03:55:12.779365", "step": 1130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:55:12.893061", "step": 1130, "epoch": 1 }, { "type": "loss", "content": 0.10398827493190765, "timestamp": "2025-09-04 03:55:12.912994", "step": 1131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:13.017369", "step": 1131, "epoch": 1 }, { "type": "loss", "content": 0.03392321988940239, "timestamp": "2025-09-04 03:55:13.036669", "step": 1132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:55:13.126425", "step": 1132, "epoch": 1 }, { "type": "loss", "content": 0.008513440378010273, "timestamp": "2025-09-04 03:55:13.144531", "step": 1133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:13.247351", "step": 1133, "epoch": 1 }, { "type": "loss", "content": 0.0382850281894207, "timestamp": "2025-09-04 03:55:13.265558", "step": 1134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:55:13.344506", "step": 1134, "epoch": 1 }, { "type": "loss", "content": 0.07204016298055649, "timestamp": "2025-09-04 03:55:13.358254", "step": 1135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:13.444419", "step": 1135, "epoch": 1 }, { "type": "loss", "content": 0.014276178553700447, "timestamp": "2025-09-04 03:55:13.460401", "step": 1136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:13.561239", "step": 1136, "epoch": 1 }, { "type": "loss", "content": 0.0037956838496029377, "timestamp": "2025-09-04 03:55:13.580737", "step": 1137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:13.689193", "step": 1137, "epoch": 1 }, { "type": "loss", "content": 0.005481324158608913, "timestamp": "2025-09-04 03:55:13.707893", "step": 1138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:13.809507", "step": 1138, "epoch": 1 }, { "type": "loss", "content": 0.0013551759766414762, "timestamp": "2025-09-04 03:55:13.827493", "step": 1139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:55:13.967598", "step": 1139, "epoch": 1 }, { "type": "loss", "content": 0.0756765827536583, "timestamp": "2025-09-04 03:55:13.994045", "step": 1140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:22.505779", "step": 1140, "epoch": 1 }, { "type": "pplx", "content": 342.1746491135325, "timestamp": "2025-09-04 03:55:22.508181", "step": 1140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:55:22.625590", "step": 1140, "epoch": 1 }, { "type": "loss", "content": 0.01116140466183424, "timestamp": "2025-09-04 03:55:22.651011", "step": 1141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:55:22.855245", "step": 1141, "epoch": 1 }, { "type": "loss", "content": 0.031512316316366196, "timestamp": "2025-09-04 03:55:22.894487", "step": 1142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:23.005275", "step": 1142, "epoch": 1 }, { "type": "loss", "content": 0.010817242786288261, "timestamp": "2025-09-04 03:55:23.025885", "step": 1143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:23.122786", "step": 1143, "epoch": 1 }, { "type": "loss", "content": 0.011584990657866001, "timestamp": "2025-09-04 03:55:23.141003", "step": 1144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:55:23.232853", "step": 1144, "epoch": 1 }, { "type": "loss", "content": 0.01660754159092903, "timestamp": "2025-09-04 03:55:23.251496", "step": 1145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:55:23.362256", "step": 1145, "epoch": 1 }, { "type": "loss", "content": 0.042043667286634445, "timestamp": "2025-09-04 03:55:23.382433", "step": 1146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:23.491895", "step": 1146, "epoch": 1 }, { "type": "loss", "content": 0.04430728405714035, "timestamp": "2025-09-04 03:55:23.512635", "step": 1147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:23.616816", "step": 1147, "epoch": 1 }, { "type": "loss", "content": 0.04766961559653282, "timestamp": "2025-09-04 03:55:23.636716", "step": 1148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:23.729421", "step": 1148, "epoch": 1 }, { "type": "loss", "content": 0.02647322788834572, "timestamp": "2025-09-04 03:55:23.748507", "step": 1149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:23.833529", "step": 1149, "epoch": 1 }, { "type": "loss", "content": 0.008523870259523392, "timestamp": "2025-09-04 03:55:23.849020", "step": 1150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:55:23.939069", "step": 1150, "epoch": 1 }, { "type": "loss", "content": 0.03426166996359825, "timestamp": "2025-09-04 03:55:23.955715", "step": 1151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:24.040437", "step": 1151, "epoch": 1 }, { "type": "loss", "content": 0.0056257485412061214, "timestamp": "2025-09-04 03:55:24.056529", "step": 1152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:55:24.148754", "step": 1152, "epoch": 1 }, { "type": "loss", "content": 0.03579988703131676, "timestamp": "2025-09-04 03:55:24.167610", "step": 1153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:24.270568", "step": 1153, "epoch": 1 }, { "type": "loss", "content": 0.021155666559934616, "timestamp": "2025-09-04 03:55:24.289678", "step": 1154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:24.400847", "step": 1154, "epoch": 1 }, { "type": "loss", "content": 0.020209012553095818, "timestamp": "2025-09-04 03:55:24.421226", "step": 1155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:24.530289", "step": 1155, "epoch": 1 }, { "type": "loss", "content": 0.08458837121725082, "timestamp": "2025-09-04 03:55:24.550087", "step": 1156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:24.632987", "step": 1156, "epoch": 1 }, { "type": "loss", "content": 0.04702272638678551, "timestamp": "2025-09-04 03:55:24.649804", "step": 1157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:55:24.772244", "step": 1157, "epoch": 1 }, { "type": "loss", "content": 0.03629223629832268, "timestamp": "2025-09-04 03:55:24.795123", "step": 1158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:24.906181", "step": 1158, "epoch": 1 }, { "type": "loss", "content": 0.010823562741279602, "timestamp": "2025-09-04 03:55:24.926963", "step": 1159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:25.026738", "step": 1159, "epoch": 1 }, { "type": "loss", "content": 0.015217112377285957, "timestamp": "2025-09-04 03:55:25.045911", "step": 1160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:33.427667", "step": 1160, "epoch": 1 }, { "type": "pplx", "content": 348.55127656576644, "timestamp": "2025-09-04 03:55:33.429586", "step": 1160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1160", "timestamp": "2025-09-04 03:55:33.897323", "step": 1160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:55:34.003968", "step": 1160, "epoch": 1 }, { "type": "loss", "content": 0.05122144892811775, "timestamp": "2025-09-04 03:55:34.026479", "step": 1161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:34.128682", "step": 1161, "epoch": 1 }, { "type": "loss", "content": 0.02695164829492569, "timestamp": "2025-09-04 03:55:34.147589", "step": 1162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:55:34.226537", "step": 1162, "epoch": 1 }, { "type": "loss", "content": 0.07939313352108002, "timestamp": "2025-09-04 03:55:34.240538", "step": 1163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:34.347710", "step": 1163, "epoch": 1 }, { "type": "loss", "content": 0.032340407371520996, "timestamp": "2025-09-04 03:55:34.367546", "step": 1164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:34.468983", "step": 1164, "epoch": 1 }, { "type": "loss", "content": 0.04987955838441849, "timestamp": "2025-09-04 03:55:34.489115", "step": 1165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:55:34.582559", "step": 1165, "epoch": 1 }, { "type": "loss", "content": 0.005085882265120745, "timestamp": "2025-09-04 03:55:34.599866", "step": 1166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 03:55:34.745576", "step": 1166, "epoch": 1 }, { "type": "loss", "content": 0.028625115752220154, "timestamp": "2025-09-04 03:55:34.773627", "step": 1167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 03:55:34.844590", "step": 1167, "epoch": 1 }, { "type": "loss", "content": 0.04103225842118263, "timestamp": "2025-09-04 03:55:34.857925", "step": 1168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:34.960699", "step": 1168, "epoch": 1 }, { "type": "loss", "content": 0.029832925647497177, "timestamp": "2025-09-04 03:55:34.981572", "step": 1169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:55:35.091917", "step": 1169, "epoch": 1 }, { "type": "loss", "content": 0.006762138567864895, "timestamp": "2025-09-04 03:55:35.112008", "step": 1170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 03:55:35.274883", "step": 1170, "epoch": 1 }, { "type": "loss", "content": 0.04091715067625046, "timestamp": "2025-09-04 03:55:35.306668", "step": 1171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:55:35.390858", "step": 1171, "epoch": 1 }, { "type": "loss", "content": 0.07869014889001846, "timestamp": "2025-09-04 03:55:35.406503", "step": 1172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:35.507835", "step": 1172, "epoch": 1 }, { "type": "loss", "content": 0.07957983762025833, "timestamp": "2025-09-04 03:55:35.528848", "step": 1173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 03:55:35.598923", "step": 1173, "epoch": 1 }, { "type": "loss", "content": 0.00997950043529272, "timestamp": "2025-09-04 03:55:35.611323", "step": 1174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:35.722162", "step": 1174, "epoch": 1 }, { "type": "loss", "content": 0.04756058380007744, "timestamp": "2025-09-04 03:55:35.742478", "step": 1175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:35.829004", "step": 1175, "epoch": 1 }, { "type": "loss", "content": 0.046221889555454254, "timestamp": "2025-09-04 03:55:35.845178", "step": 1176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:35.946113", "step": 1176, "epoch": 1 }, { "type": "loss", "content": 0.030887359753251076, "timestamp": "2025-09-04 03:55:35.966861", "step": 1177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:36.067236", "step": 1177, "epoch": 1 }, { "type": "loss", "content": 0.03892279416322708, "timestamp": "2025-09-04 03:55:36.085561", "step": 1178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:55:36.164518", "step": 1178, "epoch": 1 }, { "type": "loss", "content": 0.007792294956743717, "timestamp": "2025-09-04 03:55:36.178652", "step": 1179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:55:36.288591", "step": 1179, "epoch": 1 }, { "type": "loss", "content": 0.05254804715514183, "timestamp": "2025-09-04 03:55:36.309426", "step": 1180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:44.707890", "step": 1180, "epoch": 1 }, { "type": "pplx", "content": 353.37533564087937, "timestamp": "2025-09-04 03:55:44.709798", "step": 1180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:44.791751", "step": 1180, "epoch": 1 }, { "type": "loss", "content": 0.028075462207198143, "timestamp": "2025-09-04 03:55:44.808892", "step": 1181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:44.904297", "step": 1181, "epoch": 1 }, { "type": "loss", "content": 0.062234699726104736, "timestamp": "2025-09-04 03:55:44.921666", "step": 1182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:55:45.029947", "step": 1182, "epoch": 1 }, { "type": "loss", "content": 0.05085289850831032, "timestamp": "2025-09-04 03:55:45.050048", "step": 1183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:55:45.126108", "step": 1183, "epoch": 1 }, { "type": "loss", "content": 0.03087581880390644, "timestamp": "2025-09-04 03:55:45.140649", "step": 1184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:55:45.243704", "step": 1184, "epoch": 1 }, { "type": "loss", "content": 0.008511030115187168, "timestamp": "2025-09-04 03:55:45.265511", "step": 1185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:45.377248", "step": 1185, "epoch": 1 }, { "type": "loss", "content": 0.06436464190483093, "timestamp": "2025-09-04 03:55:45.397541", "step": 1186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:55:45.493257", "step": 1186, "epoch": 1 }, { "type": "loss", "content": 0.01800878904759884, "timestamp": "2025-09-04 03:55:45.510182", "step": 1187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:45.611829", "step": 1187, "epoch": 1 }, { "type": "loss", "content": 0.004938524682074785, "timestamp": "2025-09-04 03:55:45.631169", "step": 1188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1088 ], "flops": 21760132163840.0 }, "timestamp": "2025-09-04 03:55:45.785314", "step": 1188, "epoch": 1 }, { "type": "loss", "content": 0.03547127917408943, "timestamp": "2025-09-04 03:55:45.818526", "step": 1189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:45.923937", "step": 1189, "epoch": 1 }, { "type": "loss", "content": 0.04392065852880478, "timestamp": "2025-09-04 03:55:45.943073", "step": 1190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:55:46.044602", "step": 1190, "epoch": 1 }, { "type": "loss", "content": 0.005546086002141237, "timestamp": "2025-09-04 03:55:46.061763", "step": 1191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:55:46.173039", "step": 1191, "epoch": 1 }, { "type": "loss", "content": 0.019072137773036957, "timestamp": "2025-09-04 03:55:46.193789", "step": 1192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:55:46.285071", "step": 1192, "epoch": 1 }, { "type": "loss", "content": 0.018132086843252182, "timestamp": "2025-09-04 03:55:46.301809", "step": 1193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:46.392004", "step": 1193, "epoch": 1 }, { "type": "loss", "content": 0.04924427345395088, "timestamp": "2025-09-04 03:55:46.407467", "step": 1194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:46.510364", "step": 1194, "epoch": 1 }, { "type": "loss", "content": 0.03797117620706558, "timestamp": "2025-09-04 03:55:46.528840", "step": 1195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:55:46.627211", "step": 1195, "epoch": 1 }, { "type": "loss", "content": 0.040146905928850174, "timestamp": "2025-09-04 03:55:46.644773", "step": 1196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:46.746916", "step": 1196, "epoch": 1 }, { "type": "loss", "content": 0.03447960317134857, "timestamp": "2025-09-04 03:55:46.767915", "step": 1197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:55:46.890588", "step": 1197, "epoch": 1 }, { "type": "loss", "content": 0.019626695662736893, "timestamp": "2025-09-04 03:55:46.911658", "step": 1198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:55:47.004148", "step": 1198, "epoch": 1 }, { "type": "loss", "content": 0.00873672403395176, "timestamp": "2025-09-04 03:55:47.020696", "step": 1199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:55:47.128916", "step": 1199, "epoch": 1 }, { "type": "loss", "content": 0.06406814604997635, "timestamp": "2025-09-04 03:55:47.146364", "step": 1200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:55:55.666983", "step": 1200, "epoch": 1 }, { "type": "pplx", "content": 351.81034139828853, "timestamp": "2025-09-04 03:55:55.668968", "step": 1200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1200", "timestamp": "2025-09-04 03:55:56.192955", "step": 1200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 03:55:56.257064", "step": 1200, "epoch": 1 }, { "type": "loss", "content": 0.00952016282826662, "timestamp": "2025-09-04 03:55:56.268026", "step": 1201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:55:56.351897", "step": 1201, "epoch": 1 }, { "type": "loss", "content": 0.014243190176784992, "timestamp": "2025-09-04 03:55:56.366624", "step": 1202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 03:55:56.497096", "step": 1202, "epoch": 1 }, { "type": "loss", "content": 0.05320248380303383, "timestamp": "2025-09-04 03:55:56.520822", "step": 1203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:56.610589", "step": 1203, "epoch": 1 }, { "type": "loss", "content": 0.05997435748577118, "timestamp": "2025-09-04 03:55:56.626370", "step": 1204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:56.741582", "step": 1204, "epoch": 1 }, { "type": "loss", "content": 0.037900906056165695, "timestamp": "2025-09-04 03:55:56.763717", "step": 1205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:55:56.874415", "step": 1205, "epoch": 1 }, { "type": "loss", "content": 0.06352101266384125, "timestamp": "2025-09-04 03:55:56.894641", "step": 1206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:57.001142", "step": 1206, "epoch": 1 }, { "type": "loss", "content": 0.04056302830576897, "timestamp": "2025-09-04 03:55:57.020228", "step": 1207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:57.123990", "step": 1207, "epoch": 1 }, { "type": "loss", "content": 0.02337026037275791, "timestamp": "2025-09-04 03:55:57.143031", "step": 1208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:55:57.222919", "step": 1208, "epoch": 1 }, { "type": "loss", "content": 0.022142881527543068, "timestamp": "2025-09-04 03:55:57.237961", "step": 1209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:55:57.353958", "step": 1209, "epoch": 1 }, { "type": "loss", "content": 0.008503721095621586, "timestamp": "2025-09-04 03:55:57.374604", "step": 1210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:55:57.481228", "step": 1210, "epoch": 1 }, { "type": "loss", "content": 0.024838682264089584, "timestamp": "2025-09-04 03:55:57.500077", "step": 1211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:57.607060", "step": 1211, "epoch": 1 }, { "type": "loss", "content": 0.016720108687877655, "timestamp": "2025-09-04 03:55:57.626884", "step": 1212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:55:57.716274", "step": 1212, "epoch": 1 }, { "type": "loss", "content": 0.03313479945063591, "timestamp": "2025-09-04 03:55:57.734459", "step": 1213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:57.839337", "step": 1213, "epoch": 1 }, { "type": "loss", "content": 0.03852255269885063, "timestamp": "2025-09-04 03:55:57.858178", "step": 1214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:57.959198", "step": 1214, "epoch": 1 }, { "type": "loss", "content": 0.030346812680363655, "timestamp": "2025-09-04 03:55:57.978050", "step": 1215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:55:58.080948", "step": 1215, "epoch": 1 }, { "type": "loss", "content": 0.013426399789750576, "timestamp": "2025-09-04 03:55:58.100720", "step": 1216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:55:58.219568", "step": 1216, "epoch": 1 }, { "type": "loss", "content": 0.007183226756751537, "timestamp": "2025-09-04 03:55:58.244859", "step": 1217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:55:58.331516", "step": 1217, "epoch": 1 }, { "type": "loss", "content": 0.024007325991988182, "timestamp": "2025-09-04 03:55:58.346913", "step": 1218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:55:58.446232", "step": 1218, "epoch": 1 }, { "type": "loss", "content": 0.01614953577518463, "timestamp": "2025-09-04 03:55:58.464742", "step": 1219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:55:58.566439", "step": 1219, "epoch": 1 }, { "type": "loss", "content": 0.05485903471708298, "timestamp": "2025-09-04 03:55:58.585891", "step": 1220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:56:07.237314", "step": 1220, "epoch": 1 }, { "type": "pplx", "content": 343.1063356120184, "timestamp": "2025-09-04 03:56:07.239254", "step": 1220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:07.337811", "step": 1220, "epoch": 1 }, { "type": "loss", "content": 0.009709849022328854, "timestamp": "2025-09-04 03:56:07.358941", "step": 1221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:07.453863", "step": 1221, "epoch": 1 }, { "type": "loss", "content": 0.0713476687669754, "timestamp": "2025-09-04 03:56:07.471290", "step": 1222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:07.574956", "step": 1222, "epoch": 1 }, { "type": "loss", "content": 0.014135504141449928, "timestamp": "2025-09-04 03:56:07.594208", "step": 1223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:07.694439", "step": 1223, "epoch": 1 }, { "type": "loss", "content": 0.08039960265159607, "timestamp": "2025-09-04 03:56:07.713550", "step": 1224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:07.811723", "step": 1224, "epoch": 1 }, { "type": "loss", "content": 0.006991466507315636, "timestamp": "2025-09-04 03:56:07.832184", "step": 1225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:07.935653", "step": 1225, "epoch": 1 }, { "type": "loss", "content": 0.003120355773717165, "timestamp": "2025-09-04 03:56:07.954778", "step": 1226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:08.063069", "step": 1226, "epoch": 1 }, { "type": "loss", "content": 0.03354727104306221, "timestamp": "2025-09-04 03:56:08.083308", "step": 1227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:56:08.199707", "step": 1227, "epoch": 1 }, { "type": "loss", "content": 0.004579135682433844, "timestamp": "2025-09-04 03:56:08.222359", "step": 1228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:08.315258", "step": 1228, "epoch": 1 }, { "type": "loss", "content": 0.010765165090560913, "timestamp": "2025-09-04 03:56:08.334373", "step": 1229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:56:08.413365", "step": 1229, "epoch": 1 }, { "type": "loss", "content": 0.03472265228629112, "timestamp": "2025-09-04 03:56:08.427468", "step": 1230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:56:08.532931", "step": 1230, "epoch": 1 }, { "type": "loss", "content": 0.028265012428164482, "timestamp": "2025-09-04 03:56:08.552694", "step": 1231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:08.668961", "step": 1231, "epoch": 1 }, { "type": "loss", "content": 0.06229390949010849, "timestamp": "2025-09-04 03:56:08.688691", "step": 1232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:56:08.771497", "step": 1232, "epoch": 1 }, { "type": "loss", "content": 0.013525002636015415, "timestamp": "2025-09-04 03:56:08.787962", "step": 1233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:56:08.905323", "step": 1233, "epoch": 1 }, { "type": "loss", "content": 0.011920424178242683, "timestamp": "2025-09-04 03:56:08.927293", "step": 1234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:56:09.003578", "step": 1234, "epoch": 1 }, { "type": "loss", "content": 0.01954522728919983, "timestamp": "2025-09-04 03:56:09.017136", "step": 1235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:56:09.124546", "step": 1235, "epoch": 1 }, { "type": "loss", "content": 0.002615840407088399, "timestamp": "2025-09-04 03:56:09.145091", "step": 1236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:56:09.253716", "step": 1236, "epoch": 1 }, { "type": "loss", "content": 0.051942598074674606, "timestamp": "2025-09-04 03:56:09.276397", "step": 1237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:09.370402", "step": 1237, "epoch": 1 }, { "type": "loss", "content": 0.006919113453477621, "timestamp": "2025-09-04 03:56:09.387324", "step": 1238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:56:09.472227", "step": 1238, "epoch": 1 }, { "type": "loss", "content": 0.11789951473474503, "timestamp": "2025-09-04 03:56:09.487161", "step": 1239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:09.580412", "step": 1239, "epoch": 1 }, { "type": "loss", "content": 0.031079446896910667, "timestamp": "2025-09-04 03:56:09.598105", "step": 1240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:56:17.965019", "step": 1240, "epoch": 1 }, { "type": "pplx", "content": 339.34815384838885, "timestamp": "2025-09-04 03:56:17.966998", "step": 1240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1240", "timestamp": "2025-09-04 03:56:18.478463", "step": 1240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:56:18.583380", "step": 1240, "epoch": 1 }, { "type": "loss", "content": 0.046426158398389816, "timestamp": "2025-09-04 03:56:18.605824", "step": 1241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:56:18.716749", "step": 1241, "epoch": 1 }, { "type": "loss", "content": 0.009461128152906895, "timestamp": "2025-09-04 03:56:18.737124", "step": 1242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:18.836445", "step": 1242, "epoch": 1 }, { "type": "loss", "content": 0.02478550747036934, "timestamp": "2025-09-04 03:56:18.853794", "step": 1243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 03:56:18.924573", "step": 1243, "epoch": 1 }, { "type": "loss", "content": 0.021543040871620178, "timestamp": "2025-09-04 03:56:18.937862", "step": 1244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:56:19.022085", "step": 1244, "epoch": 1 }, { "type": "loss", "content": 0.0028493471909314394, "timestamp": "2025-09-04 03:56:19.038702", "step": 1245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:19.147377", "step": 1245, "epoch": 1 }, { "type": "loss", "content": 0.03405190631747246, "timestamp": "2025-09-04 03:56:19.165832", "step": 1246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:19.266960", "step": 1246, "epoch": 1 }, { "type": "loss", "content": 0.04478135332465172, "timestamp": "2025-09-04 03:56:19.285942", "step": 1247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:19.388388", "step": 1247, "epoch": 1 }, { "type": "loss", "content": 0.06943120062351227, "timestamp": "2025-09-04 03:56:19.408022", "step": 1248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:19.508182", "step": 1248, "epoch": 1 }, { "type": "loss", "content": 0.01673070713877678, "timestamp": "2025-09-04 03:56:19.528349", "step": 1249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:56:19.635364", "step": 1249, "epoch": 1 }, { "type": "loss", "content": 0.04285313934087753, "timestamp": "2025-09-04 03:56:19.655112", "step": 1250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:19.758825", "step": 1250, "epoch": 1 }, { "type": "loss", "content": 0.007856685668230057, "timestamp": "2025-09-04 03:56:19.777910", "step": 1251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:56:19.852320", "step": 1251, "epoch": 1 }, { "type": "loss", "content": 0.033521927893161774, "timestamp": "2025-09-04 03:56:19.866604", "step": 1252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:19.958870", "step": 1252, "epoch": 1 }, { "type": "loss", "content": 0.05988616868853569, "timestamp": "2025-09-04 03:56:19.977743", "step": 1253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:56:20.083349", "step": 1253, "epoch": 1 }, { "type": "loss", "content": 0.05120409280061722, "timestamp": "2025-09-04 03:56:20.103094", "step": 1254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:56:20.188253", "step": 1254, "epoch": 1 }, { "type": "loss", "content": 0.03182673081755638, "timestamp": "2025-09-04 03:56:20.203436", "step": 1255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:20.296178", "step": 1255, "epoch": 1 }, { "type": "loss", "content": 0.03344005346298218, "timestamp": "2025-09-04 03:56:20.313868", "step": 1256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:56:20.402001", "step": 1256, "epoch": 1 }, { "type": "loss", "content": 0.0048216646537184715, "timestamp": "2025-09-04 03:56:20.420176", "step": 1257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:56:20.501654", "step": 1257, "epoch": 1 }, { "type": "loss", "content": 0.04110339656472206, "timestamp": "2025-09-04 03:56:20.515279", "step": 1258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:20.616816", "step": 1258, "epoch": 1 }, { "type": "loss", "content": 0.0046073743142187595, "timestamp": "2025-09-04 03:56:20.635877", "step": 1259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:20.738003", "step": 1259, "epoch": 1 }, { "type": "loss", "content": 0.009192178025841713, "timestamp": "2025-09-04 03:56:20.757753", "step": 1260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:56:29.142086", "step": 1260, "epoch": 1 }, { "type": "pplx", "content": 340.13528680803216, "timestamp": "2025-09-04 03:56:29.144161", "step": 1260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:56:29.223599", "step": 1260, "epoch": 1 }, { "type": "loss", "content": 0.03510812297463417, "timestamp": "2025-09-04 03:56:29.240057", "step": 1261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:29.348362", "step": 1261, "epoch": 1 }, { "type": "loss", "content": 0.04541385546326637, "timestamp": "2025-09-04 03:56:29.368470", "step": 1262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:29.470427", "step": 1262, "epoch": 1 }, { "type": "loss", "content": 0.021681929007172585, "timestamp": "2025-09-04 03:56:29.489312", "step": 1263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:29.582465", "step": 1263, "epoch": 1 }, { "type": "loss", "content": 0.014316936954855919, "timestamp": "2025-09-04 03:56:29.600152", "step": 1264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:29.704522", "step": 1264, "epoch": 1 }, { "type": "loss", "content": 0.023813286796212196, "timestamp": "2025-09-04 03:56:29.726481", "step": 1265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:56:29.837560", "step": 1265, "epoch": 1 }, { "type": "loss", "content": 0.06364905834197998, "timestamp": "2025-09-04 03:56:29.857947", "step": 1266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 03:56:30.003616", "step": 1266, "epoch": 1 }, { "type": "loss", "content": 0.02237660065293312, "timestamp": "2025-09-04 03:56:30.031688", "step": 1267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:30.132118", "step": 1267, "epoch": 1 }, { "type": "loss", "content": 0.050466664135456085, "timestamp": "2025-09-04 03:56:30.151541", "step": 1268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:30.241954", "step": 1268, "epoch": 1 }, { "type": "loss", "content": 0.01618255488574505, "timestamp": "2025-09-04 03:56:30.260597", "step": 1269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:56:30.344420", "step": 1269, "epoch": 1 }, { "type": "loss", "content": 0.16435541212558746, "timestamp": "2025-09-04 03:56:30.359316", "step": 1270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:30.461700", "step": 1270, "epoch": 1 }, { "type": "loss", "content": 0.03205430880188942, "timestamp": "2025-09-04 03:56:30.480832", "step": 1271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:56:30.558405", "step": 1271, "epoch": 1 }, { "type": "loss", "content": 0.06162261217832565, "timestamp": "2025-09-04 03:56:30.572763", "step": 1272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:56:30.656850", "step": 1272, "epoch": 1 }, { "type": "loss", "content": 0.023222552612423897, "timestamp": "2025-09-04 03:56:30.673770", "step": 1273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:30.781833", "step": 1273, "epoch": 1 }, { "type": "loss", "content": 0.06229608505964279, "timestamp": "2025-09-04 03:56:30.801926", "step": 1274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:56:30.910768", "step": 1274, "epoch": 1 }, { "type": "loss", "content": 0.0057587032206356525, "timestamp": "2025-09-04 03:56:30.931221", "step": 1275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:31.022716", "step": 1275, "epoch": 1 }, { "type": "loss", "content": 0.026004638522863388, "timestamp": "2025-09-04 03:56:31.040502", "step": 1276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:31.130737", "step": 1276, "epoch": 1 }, { "type": "loss", "content": 0.03231319040060043, "timestamp": "2025-09-04 03:56:31.149923", "step": 1277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:31.251356", "step": 1277, "epoch": 1 }, { "type": "loss", "content": 0.02814597263932228, "timestamp": "2025-09-04 03:56:31.270274", "step": 1278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:56:31.344681", "step": 1278, "epoch": 1 }, { "type": "loss", "content": 0.04910106584429741, "timestamp": "2025-09-04 03:56:31.358005", "step": 1279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:56:31.450944", "step": 1279, "epoch": 1 }, { "type": "loss", "content": 0.05825873091816902, "timestamp": "2025-09-04 03:56:31.467043", "step": 1280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:56:39.844352", "step": 1280, "epoch": 1 }, { "type": "pplx", "content": 340.68138853975984, "timestamp": "2025-09-04 03:56:39.846112", "step": 1280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1280", "timestamp": "2025-09-04 03:56:40.198819", "step": 1280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:40.296797", "step": 1280, "epoch": 1 }, { "type": "loss", "content": 0.010946203954517841, "timestamp": "2025-09-04 03:56:40.317552", "step": 1281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:56:40.402157", "step": 1281, "epoch": 1 }, { "type": "loss", "content": 0.01738363690674305, "timestamp": "2025-09-04 03:56:40.417412", "step": 1282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:40.511099", "step": 1282, "epoch": 1 }, { "type": "loss", "content": 0.026985451579093933, "timestamp": "2025-09-04 03:56:40.528410", "step": 1283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:40.621743", "step": 1283, "epoch": 1 }, { "type": "loss", "content": 0.005689225625246763, "timestamp": "2025-09-04 03:56:40.639547", "step": 1284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:40.730975", "step": 1284, "epoch": 1 }, { "type": "loss", "content": 0.03335564583539963, "timestamp": "2025-09-04 03:56:40.749866", "step": 1285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:40.850254", "step": 1285, "epoch": 1 }, { "type": "loss", "content": 0.014564265497028828, "timestamp": "2025-09-04 03:56:40.868661", "step": 1286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:40.967605", "step": 1286, "epoch": 1 }, { "type": "loss", "content": 0.04401400312781334, "timestamp": "2025-09-04 03:56:40.985931", "step": 1287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:41.081033", "step": 1287, "epoch": 1 }, { "type": "loss", "content": 0.031287554651498795, "timestamp": "2025-09-04 03:56:41.099129", "step": 1288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:56:41.208052", "step": 1288, "epoch": 1 }, { "type": "loss", "content": 0.01182449609041214, "timestamp": "2025-09-04 03:56:41.230594", "step": 1289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:56:41.314338", "step": 1289, "epoch": 1 }, { "type": "loss", "content": 0.08080806583166122, "timestamp": "2025-09-04 03:56:41.329180", "step": 1290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:41.433827", "step": 1290, "epoch": 1 }, { "type": "loss", "content": 0.005769069772213697, "timestamp": "2025-09-04 03:56:41.452928", "step": 1291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:41.556488", "step": 1291, "epoch": 1 }, { "type": "loss", "content": 0.007577816490083933, "timestamp": "2025-09-04 03:56:41.576092", "step": 1292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:41.667203", "step": 1292, "epoch": 1 }, { "type": "loss", "content": 0.029849909245967865, "timestamp": "2025-09-04 03:56:41.685829", "step": 1293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:56:41.795843", "step": 1293, "epoch": 1 }, { "type": "loss", "content": 0.04431498050689697, "timestamp": "2025-09-04 03:56:41.816259", "step": 1294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:41.919388", "step": 1294, "epoch": 1 }, { "type": "loss", "content": 0.01207160297781229, "timestamp": "2025-09-04 03:56:41.938071", "step": 1295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 03:56:42.072302", "step": 1295, "epoch": 1 }, { "type": "loss", "content": 0.007530734874308109, "timestamp": "2025-09-04 03:56:42.097544", "step": 1296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:42.187551", "step": 1296, "epoch": 1 }, { "type": "loss", "content": 0.030552592128515244, "timestamp": "2025-09-04 03:56:42.206373", "step": 1297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:56:42.283972", "step": 1297, "epoch": 1 }, { "type": "loss", "content": 0.026529986411333084, "timestamp": "2025-09-04 03:56:42.297966", "step": 1298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:56:42.390767", "step": 1298, "epoch": 1 }, { "type": "loss", "content": 0.06174427270889282, "timestamp": "2025-09-04 03:56:42.407715", "step": 1299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:56:42.524343", "step": 1299, "epoch": 1 }, { "type": "loss", "content": 0.017831875011324883, "timestamp": "2025-09-04 03:56:42.545459", "step": 1300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:56:50.958266", "step": 1300, "epoch": 1 }, { "type": "pplx", "content": 344.45454539575513, "timestamp": "2025-09-04 03:56:50.960462", "step": 1300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:56:51.033210", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.00863197073340416, "timestamp": "2025-09-04 03:56:51.048203", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:51.152787", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.03207479417324066, "timestamp": "2025-09-04 03:56:51.171776", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:56:51.272855", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.03501874580979347, "timestamp": "2025-09-04 03:56:51.291496", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:56:51.395438", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.040076903998851776, "timestamp": "2025-09-04 03:56:51.415398", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:51.507169", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.06259066611528397, "timestamp": "2025-09-04 03:56:51.526093", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:51.621169", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.005843855440616608, "timestamp": "2025-09-04 03:56:51.638364", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:51.746720", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.016627954319119453, "timestamp": "2025-09-04 03:56:51.766813", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:56:51.870051", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.01708150841295719, "timestamp": "2025-09-04 03:56:51.889764", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:56:51.993489", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.02616330236196518, "timestamp": "2025-09-04 03:56:52.015383", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:56:52.110091", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.0031908308155834675, "timestamp": "2025-09-04 03:56:52.127233", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:56:52.199864", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.03640008345246315, "timestamp": "2025-09-04 03:56:52.212599", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:52.308612", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.009561690501868725, "timestamp": "2025-09-04 03:56:52.326635", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:56:52.424790", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.04150993376970291, "timestamp": "2025-09-04 03:56:52.445011", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:56:52.536780", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.019684717059135437, "timestamp": "2025-09-04 03:56:52.552190", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:56:52.661011", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.011558826081454754, "timestamp": "2025-09-04 03:56:52.681084", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:56:52.767800", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.01043105311691761, "timestamp": "2025-09-04 03:56:52.783965", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:56:52.877263", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.02803068794310093, "timestamp": "2025-09-04 03:56:52.896318", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:56:52.973511", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.09199260920286179, "timestamp": "2025-09-04 03:56:52.987265", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:56:53.059699", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.00931278895586729, "timestamp": "2025-09-04 03:56:53.072457", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:56:53.183382", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.008658390492200851, "timestamp": "2025-09-04 03:56:53.204794", "step": 1320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:01.599545", "step": 1320, "epoch": 2 }, { "type": "pplx", "content": 351.6464470611636, "timestamp": "2025-09-04 03:57:01.601996", "step": 1320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1320", "timestamp": "2025-09-04 03:57:02.093639", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:02.198545", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.009841070510447025, "timestamp": "2025-09-04 03:57:02.220801", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1392 ], "flops": 27840169073088.0 }, "timestamp": "2025-09-04 03:57:02.426115", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.016447851434350014, "timestamp": "2025-09-04 03:57:02.465621", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:57:02.576052", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.030630143359303474, "timestamp": "2025-09-04 03:57:02.596668", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 03:57:02.812115", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.03536463528871536, "timestamp": "2025-09-04 03:57:02.853461", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:02.955808", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.008404337801039219, "timestamp": "2025-09-04 03:57:02.976945", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:03.085024", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.02297317609190941, "timestamp": "2025-09-04 03:57:03.104900", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:03.195185", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.0200307946652174, "timestamp": "2025-09-04 03:57:03.212065", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:03.311518", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.006862281356006861, "timestamp": "2025-09-04 03:57:03.330810", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:03.436212", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.13922302424907684, "timestamp": "2025-09-04 03:57:03.458300", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:57:03.567713", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.028604896739125252, "timestamp": "2025-09-04 03:57:03.588279", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:03.690531", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.015768418088555336, "timestamp": "2025-09-04 03:57:03.709730", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:03.803908", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.021719908341765404, "timestamp": "2025-09-04 03:57:03.821947", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:57:03.896565", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.02307227812707424, "timestamp": "2025-09-04 03:57:03.911261", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:04.002837", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.0028728533070534468, "timestamp": "2025-09-04 03:57:04.019376", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:04.122476", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.03315138444304466, "timestamp": "2025-09-04 03:57:04.141454", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:04.236926", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.008455898612737656, "timestamp": "2025-09-04 03:57:04.255020", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:04.361785", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.1121997982263565, "timestamp": "2025-09-04 03:57:04.384050", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:04.497628", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.017756765708327293, "timestamp": "2025-09-04 03:57:04.517733", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:57:04.603610", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.00965754222124815, "timestamp": "2025-09-04 03:57:04.618677", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:57:04.696431", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.02266879379749298, "timestamp": "2025-09-04 03:57:04.711103", "step": 1340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:13.127580", "step": 1340, "epoch": 2 }, { "type": "pplx", "content": 356.8557867994047, "timestamp": "2025-09-04 03:57:13.129660", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:57:13.211956", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.028563033789396286, "timestamp": "2025-09-04 03:57:13.228882", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:57:13.330240", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.04276951029896736, "timestamp": "2025-09-04 03:57:13.349088", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:13.455596", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.017385128885507584, "timestamp": "2025-09-04 03:57:13.475336", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:13.559205", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.009770605713129044, "timestamp": "2025-09-04 03:57:13.574334", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:57:13.650919", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.01318974420428276, "timestamp": "2025-09-04 03:57:13.665861", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:13.774914", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.004364403896033764, "timestamp": "2025-09-04 03:57:13.795071", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:13.899168", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.007132431026548147, "timestamp": "2025-09-04 03:57:13.918237", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:14.009956", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.012063604779541492, "timestamp": "2025-09-04 03:57:14.027233", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:14.125654", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.020999517291784286, "timestamp": "2025-09-04 03:57:14.145819", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:14.239683", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.007064122706651688, "timestamp": "2025-09-04 03:57:14.256533", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:14.359985", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.00644453801214695, "timestamp": "2025-09-04 03:57:14.378894", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:14.465017", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.004426640458405018, "timestamp": "2025-09-04 03:57:14.480574", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:14.586535", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.00213529821485281, "timestamp": "2025-09-04 03:57:14.608254", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:14.703837", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.009575147181749344, "timestamp": "2025-09-04 03:57:14.721141", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:14.812433", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.009818262420594692, "timestamp": "2025-09-04 03:57:14.829234", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:14.907815", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.006910801865160465, "timestamp": "2025-09-04 03:57:14.922554", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:57:15.020511", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.025037406012415886, "timestamp": "2025-09-04 03:57:15.040919", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:15.141088", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.058873310685157776, "timestamp": "2025-09-04 03:57:15.159416", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:15.257132", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.008470847271382809, "timestamp": "2025-09-04 03:57:15.274434", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:15.380421", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.038173649460077286, "timestamp": "2025-09-04 03:57:15.400957", "step": 1360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:23.804518", "step": 1360, "epoch": 2 }, { "type": "pplx", "content": 358.21292183839546, "timestamp": "2025-09-04 03:57:23.806482", "step": 1360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1360", "timestamp": "2025-09-04 03:57:24.379494", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:24.471417", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.003160255728289485, "timestamp": "2025-09-04 03:57:24.490511", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:57:24.600621", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.01291283406317234, "timestamp": "2025-09-04 03:57:24.620887", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:24.712441", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.01215626671910286, "timestamp": "2025-09-04 03:57:24.729005", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 03:57:24.949046", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.007885631173849106, "timestamp": "2025-09-04 03:57:24.992019", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:57:25.085325", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.009847918525338173, "timestamp": "2025-09-04 03:57:25.104269", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:25.212570", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.056854456663131714, "timestamp": "2025-09-04 03:57:25.232642", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 03:57:25.361488", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.05310133844614029, "timestamp": "2025-09-04 03:57:25.384890", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:25.492476", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.014655319042503834, "timestamp": "2025-09-04 03:57:25.512923", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:57:25.595251", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.012807800434529781, "timestamp": "2025-09-04 03:57:25.611697", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:25.714660", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.012346881441771984, "timestamp": "2025-09-04 03:57:25.733731", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:25.812931", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.08287136256694794, "timestamp": "2025-09-04 03:57:25.826896", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:25.934910", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.020002219825983047, "timestamp": "2025-09-04 03:57:25.955901", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:26.064243", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.11615262180566788, "timestamp": "2025-09-04 03:57:26.084428", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:57:26.196826", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.006950510665774345, "timestamp": "2025-09-04 03:57:26.217171", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:26.324298", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.008125067688524723, "timestamp": "2025-09-04 03:57:26.344166", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:57:26.427863", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.027247097343206406, "timestamp": "2025-09-04 03:57:26.443748", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:26.525142", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.0010083671659231186, "timestamp": "2025-09-04 03:57:26.541440", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:26.641242", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.021171875298023224, "timestamp": "2025-09-04 03:57:26.659635", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:57:26.770114", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.05313687399029732, "timestamp": "2025-09-04 03:57:26.790735", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:26.897067", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.009001716040074825, "timestamp": "2025-09-04 03:57:26.917597", "step": 1380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:35.371165", "step": 1380, "epoch": 2 }, { "type": "pplx", "content": 356.8696831217775, "timestamp": "2025-09-04 03:57:35.374916", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:35.479063", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.03470727056264877, "timestamp": "2025-09-04 03:57:35.501369", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:35.598653", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.04004732519388199, "timestamp": "2025-09-04 03:57:35.616139", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:35.722177", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.03036014549434185, "timestamp": "2025-09-04 03:57:35.741436", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:35.849375", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.05383269488811493, "timestamp": "2025-09-04 03:57:35.870112", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:35.963863", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.10220647603273392, "timestamp": "2025-09-04 03:57:35.983063", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:57:36.070880", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.010271705687046051, "timestamp": "2025-09-04 03:57:36.086507", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:36.190060", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.018055927008390427, "timestamp": "2025-09-04 03:57:36.209350", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:36.288183", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.0347399078309536, "timestamp": "2025-09-04 03:57:36.303106", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:57:36.374190", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.017108459025621414, "timestamp": "2025-09-04 03:57:36.388356", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:36.467290", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.019396977499127388, "timestamp": "2025-09-04 03:57:36.481460", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:36.584044", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.029010986909270287, "timestamp": "2025-09-04 03:57:36.603372", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:57:36.688236", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.05589752271771431, "timestamp": "2025-09-04 03:57:36.704586", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:36.795039", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.027025602757930756, "timestamp": "2025-09-04 03:57:36.813941", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:57:36.936119", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.0015675474423915148, "timestamp": "2025-09-04 03:57:36.959466", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:37.044245", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.02568882144987583, "timestamp": "2025-09-04 03:57:37.059306", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:37.158661", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.06579367071390152, "timestamp": "2025-09-04 03:57:37.178098", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:57:37.275495", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.013132071122527122, "timestamp": "2025-09-04 03:57:37.296202", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:57:37.402412", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.00368613563477993, "timestamp": "2025-09-04 03:57:37.422511", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:37.520438", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.027636591345071793, "timestamp": "2025-09-04 03:57:37.539120", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:37.633885", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.01690077967941761, "timestamp": "2025-09-04 03:57:37.652181", "step": 1400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:46.023669", "step": 1400, "epoch": 2 }, { "type": "pplx", "content": 355.6197814395891, "timestamp": "2025-09-04 03:57:46.025721", "step": 1400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1400", "timestamp": "2025-09-04 03:57:46.538383", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:57:46.671119", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.035729728639125824, "timestamp": "2025-09-04 03:57:46.699862", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:46.790258", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.06573422253131866, "timestamp": "2025-09-04 03:57:46.806890", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:57:46.909490", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.014802222140133381, "timestamp": "2025-09-04 03:57:46.928221", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:47.030134", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.03306787088513374, "timestamp": "2025-09-04 03:57:47.049863", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:57:47.133628", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.014069135300815105, "timestamp": "2025-09-04 03:57:47.150471", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:57:47.254620", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.011790101416409016, "timestamp": "2025-09-04 03:57:47.273633", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:57:47.357993", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.036905501037836075, "timestamp": "2025-09-04 03:57:47.373168", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:57:47.490818", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.04053547978401184, "timestamp": "2025-09-04 03:57:47.513458", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:47.610652", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.10574720799922943, "timestamp": "2025-09-04 03:57:47.630819", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:47.726719", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.03774774447083473, "timestamp": "2025-09-04 03:57:47.743637", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:57:47.855271", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.11015975475311279, "timestamp": "2025-09-04 03:57:47.875714", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:47.975623", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.031532928347587585, "timestamp": "2025-09-04 03:57:47.994819", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:48.084984", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.017254313454031944, "timestamp": "2025-09-04 03:57:48.103711", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:57:48.180312", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.12299531698226929, "timestamp": "2025-09-04 03:57:48.194006", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:48.289289", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.011667820625007153, "timestamp": "2025-09-04 03:57:48.306300", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:48.415936", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.06725858896970749, "timestamp": "2025-09-04 03:57:48.435803", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:57:48.528418", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.022478509694337845, "timestamp": "2025-09-04 03:57:48.547300", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:48.656960", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.034010887145996094, "timestamp": "2025-09-04 03:57:48.677147", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:48.775323", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.030013922601938248, "timestamp": "2025-09-04 03:57:48.792730", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:57:48.910608", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.004974214360117912, "timestamp": "2025-09-04 03:57:48.933542", "step": 1420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:57:57.300582", "step": 1420, "epoch": 2 }, { "type": "pplx", "content": 352.19748924586287, "timestamp": "2025-09-04 03:57:57.303117", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 03:57:57.483232", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.004329991061240435, "timestamp": "2025-09-04 03:57:57.521231", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:57.605182", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.012752629816532135, "timestamp": "2025-09-04 03:57:57.620094", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:57:57.737791", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.011430691927671432, "timestamp": "2025-09-04 03:57:57.759708", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:57:57.834971", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.09056932479143143, "timestamp": "2025-09-04 03:57:57.849283", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:57.928832", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.05736755579710007, "timestamp": "2025-09-04 03:57:57.944122", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:57:58.034731", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.010904895141720772, "timestamp": "2025-09-04 03:57:58.051389", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:57:58.161440", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.015301964245736599, "timestamp": "2025-09-04 03:57:58.181793", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:57:58.278459", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.006432659458369017, "timestamp": "2025-09-04 03:57:58.296416", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:57:58.393057", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.04648457467556, "timestamp": "2025-09-04 03:57:58.413285", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:57:58.498896", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.012602617032825947, "timestamp": "2025-09-04 03:57:58.513769", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:57:58.622078", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.00956171378493309, "timestamp": "2025-09-04 03:57:58.642137", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:57:58.736074", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.20023715496063232, "timestamp": "2025-09-04 03:57:58.753934", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:57:58.831296", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.03244839608669281, "timestamp": "2025-09-04 03:57:58.846617", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:57:58.947316", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.019599279388785362, "timestamp": "2025-09-04 03:57:58.965931", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:57:59.083309", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.016808245331048965, "timestamp": "2025-09-04 03:57:59.105217", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:57:59.216073", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.017745893448591232, "timestamp": "2025-09-04 03:57:59.237285", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:59.336156", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.007716418243944645, "timestamp": "2025-09-04 03:57:59.356987", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:59.452348", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.0655415803194046, "timestamp": "2025-09-04 03:57:59.469660", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:57:59.572428", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.039280664175748825, "timestamp": "2025-09-04 03:57:59.591378", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:57:59.686127", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.047219302505254745, "timestamp": "2025-09-04 03:57:59.704167", "step": 1440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:58:08.097411", "step": 1440, "epoch": 2 }, { "type": "pplx", "content": 347.4976195756163, "timestamp": "2025-09-04 03:58:08.099469", "step": 1440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1440", "timestamp": "2025-09-04 03:58:08.613639", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:08.712984", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.022071918472647667, "timestamp": "2025-09-04 03:58:08.733921", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:08.842886", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.02069196105003357, "timestamp": "2025-09-04 03:58:08.862926", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:08.972102", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.001259633689187467, "timestamp": "2025-09-04 03:58:08.992159", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:09.098857", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.05347844213247299, "timestamp": "2025-09-04 03:58:09.119414", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 03:58:09.245780", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.05717499926686287, "timestamp": "2025-09-04 03:58:09.272832", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:09.373325", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.040185511112213135, "timestamp": "2025-09-04 03:58:09.392288", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:09.500710", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.024745700880885124, "timestamp": "2025-09-04 03:58:09.520583", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:09.628530", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.011510748416185379, "timestamp": "2025-09-04 03:58:09.649508", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:58:09.734063", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.014999719336628914, "timestamp": "2025-09-04 03:58:09.750818", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:58:09.861652", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.006128712557256222, "timestamp": "2025-09-04 03:58:09.882168", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:09.985238", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.013433975167572498, "timestamp": "2025-09-04 03:58:10.004317", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:58:10.081730", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.007306399755179882, "timestamp": "2025-09-04 03:58:10.096553", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:58:10.188094", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.004065982531756163, "timestamp": "2025-09-04 03:58:10.207086", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:10.307494", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.005130627192556858, "timestamp": "2025-09-04 03:58:10.326129", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:58:10.404524", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.01915198192000389, "timestamp": "2025-09-04 03:58:10.418656", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:10.519421", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.006536707282066345, "timestamp": "2025-09-04 03:58:10.538875", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:58:10.645308", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.029277930036187172, "timestamp": "2025-09-04 03:58:10.667816", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:58:10.784355", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.01957911066710949, "timestamp": "2025-09-04 03:58:10.806480", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:58:10.916254", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.009834788739681244, "timestamp": "2025-09-04 03:58:10.936503", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:11.044736", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.040386419743299484, "timestamp": "2025-09-04 03:58:11.065853", "step": 1460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:58:19.451938", "step": 1460, "epoch": 2 }, { "type": "pplx", "content": 348.4467862100084, "timestamp": "2025-09-04 03:58:19.453968", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:58:19.549185", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.05862518399953842, "timestamp": "2025-09-04 03:58:19.569593", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:58:19.653427", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.010720998048782349, "timestamp": "2025-09-04 03:58:19.668367", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:19.769165", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.0041922107338905334, "timestamp": "2025-09-04 03:58:19.788193", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:58:19.871897", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.02185150980949402, "timestamp": "2025-09-04 03:58:19.887593", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:19.978920", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.02324114739894867, "timestamp": "2025-09-04 03:58:19.997814", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:58:20.089399", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.01927514187991619, "timestamp": "2025-09-04 03:58:20.105963", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:58:20.200168", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.029533013701438904, "timestamp": "2025-09-04 03:58:20.217003", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:20.321673", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.06191162019968033, "timestamp": "2025-09-04 03:58:20.341487", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:20.444498", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.008275226689875126, "timestamp": "2025-09-04 03:58:20.466423", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:20.562481", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.020667975768446922, "timestamp": "2025-09-04 03:58:20.579977", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:58:20.653299", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.023650793358683586, "timestamp": "2025-09-04 03:58:20.665998", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:20.767543", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.02776520885527134, "timestamp": "2025-09-04 03:58:20.786811", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:20.864142", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.01794314943253994, "timestamp": "2025-09-04 03:58:20.878780", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:20.990087", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.028163762763142586, "timestamp": "2025-09-04 03:58:21.009697", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:21.118248", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.016635507345199585, "timestamp": "2025-09-04 03:58:21.137598", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:58:21.223856", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.09255839139223099, "timestamp": "2025-09-04 03:58:21.239630", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:21.338871", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.03798213601112366, "timestamp": "2025-09-04 03:58:21.359255", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:21.460978", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.02084563672542572, "timestamp": "2025-09-04 03:58:21.479589", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:58:21.563333", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.009020745754241943, "timestamp": "2025-09-04 03:58:21.578184", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:21.688017", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.0042407093569636345, "timestamp": "2025-09-04 03:58:21.708855", "step": 1480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:58:30.096181", "step": 1480, "epoch": 2 }, { "type": "pplx", "content": 350.9113278716737, "timestamp": "2025-09-04 03:58:30.098321", "step": 1480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1480", "timestamp": "2025-09-04 03:58:30.453234", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:30.528236", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.042050596326589584, "timestamp": "2025-09-04 03:58:30.543158", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:30.651692", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.023750372231006622, "timestamp": "2025-09-04 03:58:30.672015", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:30.776255", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.008751154877245426, "timestamp": "2025-09-04 03:58:30.795315", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:30.872932", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.013106334023177624, "timestamp": "2025-09-04 03:58:30.887472", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 03:58:31.005957", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.029924411326646805, "timestamp": "2025-09-04 03:58:31.029590", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:58:31.130014", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.024293435737490654, "timestamp": "2025-09-04 03:58:31.148274", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:31.257554", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.06716310232877731, "timestamp": "2025-09-04 03:58:31.277164", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:31.384124", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.02108912356197834, "timestamp": "2025-09-04 03:58:31.404686", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:31.481010", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.07503022998571396, "timestamp": "2025-09-04 03:58:31.496061", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:58:31.581333", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.0028276981320232153, "timestamp": "2025-09-04 03:58:31.596330", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:58:31.689572", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.02281711809337139, "timestamp": "2025-09-04 03:58:31.706718", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:58:31.816423", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.026799194514751434, "timestamp": "2025-09-04 03:58:31.837569", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:58:31.945511", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.027529401704669, "timestamp": "2025-09-04 03:58:31.968207", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:32.077972", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.0032774689607322216, "timestamp": "2025-09-04 03:58:32.097948", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:32.193448", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.03091510199010372, "timestamp": "2025-09-04 03:58:32.210579", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:32.289594", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.018100092187523842, "timestamp": "2025-09-04 03:58:32.303714", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:32.405510", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.06709955632686615, "timestamp": "2025-09-04 03:58:32.425599", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:32.526213", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.016714708879590034, "timestamp": "2025-09-04 03:58:32.543025", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:58:32.641548", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.009932179003953934, "timestamp": "2025-09-04 03:58:32.658618", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:58:32.771050", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.002495410619303584, "timestamp": "2025-09-04 03:58:32.792042", "step": 1500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:58:41.271072", "step": 1500, "epoch": 2 }, { "type": "pplx", "content": 352.31429226333887, "timestamp": "2025-09-04 03:58:41.273723", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:58:41.354514", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.11568693816661835, "timestamp": "2025-09-04 03:58:41.371425", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:41.475740", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.01670590229332447, "timestamp": "2025-09-04 03:58:41.494996", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:58:41.589828", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.019445618614554405, "timestamp": "2025-09-04 03:58:41.606867", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:58:41.685355", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.057172223925590515, "timestamp": "2025-09-04 03:58:41.700285", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:41.800836", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.00811680406332016, "timestamp": "2025-09-04 03:58:41.821852", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:41.904781", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.04069969430565834, "timestamp": "2025-09-04 03:58:41.918645", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:58:42.006065", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.011170770972967148, "timestamp": "2025-09-04 03:58:42.019930", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:42.124593", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.011762701906263828, "timestamp": "2025-09-04 03:58:42.144303", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:42.250667", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.004596356768161058, "timestamp": "2025-09-04 03:58:42.272663", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:42.379939", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.005542484112083912, "timestamp": "2025-09-04 03:58:42.399644", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:42.504187", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.048978500068187714, "timestamp": "2025-09-04 03:58:42.523284", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:42.627086", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.0095089515671134, "timestamp": "2025-09-04 03:58:42.646752", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:42.745841", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.012734484858810902, "timestamp": "2025-09-04 03:58:42.766995", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 03:58:42.971571", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.0094110993668437, "timestamp": "2025-09-04 03:58:43.010715", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:43.087980", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.006989278364926577, "timestamp": "2025-09-04 03:58:43.101850", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:43.202917", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.03085504285991192, "timestamp": "2025-09-04 03:58:43.222336", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:58:43.320624", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.0065007261000573635, "timestamp": "2025-09-04 03:58:43.340918", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:43.443356", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.020510029047727585, "timestamp": "2025-09-04 03:58:43.462464", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:58:43.561904", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.010851016268134117, "timestamp": "2025-09-04 03:58:43.580231", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:58:43.681348", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.011947129853069782, "timestamp": "2025-09-04 03:58:43.700623", "step": 1520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:58:52.103185", "step": 1520, "epoch": 2 }, { "type": "pplx", "content": 354.340881585885, "timestamp": "2025-09-04 03:58:52.104933", "step": 1520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1520", "timestamp": "2025-09-04 03:58:52.596824", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:52.671295", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.004985039122402668, "timestamp": "2025-09-04 03:58:52.686417", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:52.763053", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.04416259378194809, "timestamp": "2025-09-04 03:58:52.776998", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:58:52.885793", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.07206206768751144, "timestamp": "2025-09-04 03:58:52.906087", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:58:52.999974", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.011458688415586948, "timestamp": "2025-09-04 03:58:53.017860", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:53.116338", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.014014442451298237, "timestamp": "2025-09-04 03:58:53.137043", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:53.238547", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.007480642758309841, "timestamp": "2025-09-04 03:58:53.257330", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 03:58:53.477272", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.0391855351626873, "timestamp": "2025-09-04 03:58:53.519609", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:58:53.622364", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.002516398439183831, "timestamp": "2025-09-04 03:58:53.642357", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:58:53.717657", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.028322208672761917, "timestamp": "2025-09-04 03:58:53.732958", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1200 ], "flops": 24000145761984.0 }, "timestamp": "2025-09-04 03:58:53.908721", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.0025973438750952482, "timestamp": "2025-09-04 03:58:53.941662", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:54.036743", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.06388135254383087, "timestamp": "2025-09-04 03:58:54.054194", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:58:54.160713", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.006396736484020948, "timestamp": "2025-09-04 03:58:54.181459", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:58:54.286647", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.028667569160461426, "timestamp": "2025-09-04 03:58:54.308867", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:54.410070", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.11603422462940216, "timestamp": "2025-09-04 03:58:54.428882", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:54.534808", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.011094557121396065, "timestamp": "2025-09-04 03:58:54.553867", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:58:54.757887", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.02706741727888584, "timestamp": "2025-09-04 03:58:54.797893", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:58:54.892179", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.01568935438990593, "timestamp": "2025-09-04 03:58:54.911469", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:58:54.995117", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.009411556646227837, "timestamp": "2025-09-04 03:58:55.010256", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:58:55.118774", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.01080233883112669, "timestamp": "2025-09-04 03:58:55.138032", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:58:55.237800", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.016464075073599815, "timestamp": "2025-09-04 03:58:55.257398", "step": 1540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:03.697121", "step": 1540, "epoch": 2 }, { "type": "pplx", "content": 358.13731037704423, "timestamp": "2025-09-04 03:59:03.699487", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:03.800069", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.017000077292323112, "timestamp": "2025-09-04 03:59:03.821025", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:59:03.926237", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.018205707892775536, "timestamp": "2025-09-04 03:59:03.945068", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:59:04.069382", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.032110054045915604, "timestamp": "2025-09-04 03:59:04.091913", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:04.202077", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.0027143440674990416, "timestamp": "2025-09-04 03:59:04.222663", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:59:04.332288", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.030216865241527557, "timestamp": "2025-09-04 03:59:04.354365", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:04.465187", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.03675489500164986, "timestamp": "2025-09-04 03:59:04.484744", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:59:04.562572", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.03915338218212128, "timestamp": "2025-09-04 03:59:04.576156", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:59:04.714903", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.0727081373333931, "timestamp": "2025-09-04 03:59:04.741282", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:59:04.819762", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.03566645830869675, "timestamp": "2025-09-04 03:59:04.834687", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:04.939334", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.020596234127879143, "timestamp": "2025-09-04 03:59:04.957967", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:05.060642", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.00818456057459116, "timestamp": "2025-09-04 03:59:05.078828", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:05.189583", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.05260307341814041, "timestamp": "2025-09-04 03:59:05.210406", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:05.313980", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.006511691492050886, "timestamp": "2025-09-04 03:59:05.334487", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:05.440815", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.009990708902478218, "timestamp": "2025-09-04 03:59:05.459864", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 03:59:05.636814", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.004030467942357063, "timestamp": "2025-09-04 03:59:05.668857", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:59:05.756761", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.031922031193971634, "timestamp": "2025-09-04 03:59:05.772959", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:05.865809", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.021661344915628433, "timestamp": "2025-09-04 03:59:05.884706", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:05.979801", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.01415644958615303, "timestamp": "2025-09-04 03:59:05.996754", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:06.091347", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.0253834780305624, "timestamp": "2025-09-04 03:59:06.108290", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:06.210438", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.008007410913705826, "timestamp": "2025-09-04 03:59:06.229849", "step": 1560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:14.729217", "step": 1560, "epoch": 2 }, { "type": "pplx", "content": 359.43894958396004, "timestamp": "2025-09-04 03:59:14.731389", "step": 1560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1560", "timestamp": "2025-09-04 03:59:15.095862", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:15.200111", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.012292720377445221, "timestamp": "2025-09-04 03:59:15.222316", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:15.322245", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.03446212038397789, "timestamp": "2025-09-04 03:59:15.340799", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:15.440100", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.0038441934157162905, "timestamp": "2025-09-04 03:59:15.458765", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:59:15.569481", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.009270180948078632, "timestamp": "2025-09-04 03:59:15.590580", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:15.683992", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.0015130855608731508, "timestamp": "2025-09-04 03:59:15.702558", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:59:15.812496", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.001221196842379868, "timestamp": "2025-09-04 03:59:15.833028", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 03:59:15.917093", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.01908346638083458, "timestamp": "2025-09-04 03:59:15.931975", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:59:16.037182", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.010458694770932198, "timestamp": "2025-09-04 03:59:16.056869", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 03:59:16.254852", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.03718806803226471, "timestamp": "2025-09-04 03:59:16.297722", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:59:16.414907", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.006845841184258461, "timestamp": "2025-09-04 03:59:16.437167", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:16.531656", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.017309976741671562, "timestamp": "2025-09-04 03:59:16.549235", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:16.656486", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.038009658455848694, "timestamp": "2025-09-04 03:59:16.677504", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:16.781767", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.005971661768853664, "timestamp": "2025-09-04 03:59:16.803460", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:59:16.939010", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.0011978168040513992, "timestamp": "2025-09-04 03:59:16.964950", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:59:17.050202", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.06003192439675331, "timestamp": "2025-09-04 03:59:17.065248", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:59:17.155313", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.007113071624189615, "timestamp": "2025-09-04 03:59:17.172663", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:17.269765", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.012526609003543854, "timestamp": "2025-09-04 03:59:17.289917", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 03:59:17.373409", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.05469227954745293, "timestamp": "2025-09-04 03:59:17.388299", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:17.481308", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.020947255194187164, "timestamp": "2025-09-04 03:59:17.498309", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:59:17.570426", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.049049898982048035, "timestamp": "2025-09-04 03:59:17.583920", "step": 1580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:25.958273", "step": 1580, "epoch": 2 }, { "type": "pplx", "content": 357.77819634386793, "timestamp": "2025-09-04 03:59:25.959793", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:59:26.031511", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.022066203877329826, "timestamp": "2025-09-04 03:59:26.046549", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:59:26.124819", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.02584720402956009, "timestamp": "2025-09-04 03:59:26.139065", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:26.233903", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.004149719141423702, "timestamp": "2025-09-04 03:59:26.251416", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:26.344125", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.026423068717122078, "timestamp": "2025-09-04 03:59:26.361696", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:26.458179", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.014465868473052979, "timestamp": "2025-09-04 03:59:26.478437", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 03:59:26.595337", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.04363749548792839, "timestamp": "2025-09-04 03:59:26.617500", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:26.711320", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.021764691919088364, "timestamp": "2025-09-04 03:59:26.728639", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 03:59:26.866445", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.008350017480552197, "timestamp": "2025-09-04 03:59:26.893339", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:26.983861", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.038341108709573746, "timestamp": "2025-09-04 03:59:27.002454", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:27.096991", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.033245623111724854, "timestamp": "2025-09-04 03:59:27.114289", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:27.214904", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.025423089042305946, "timestamp": "2025-09-04 03:59:27.233747", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 03:59:27.314232", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.07140816748142242, "timestamp": "2025-09-04 03:59:27.328990", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:27.426941", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.06752584129571915, "timestamp": "2025-09-04 03:59:27.447667", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 03:59:27.519642", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.06612498313188553, "timestamp": "2025-09-04 03:59:27.532132", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:27.640395", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.011722175404429436, "timestamp": "2025-09-04 03:59:27.660381", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:27.759366", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.008636832237243652, "timestamp": "2025-09-04 03:59:27.778747", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:59:27.862555", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.015985198318958282, "timestamp": "2025-09-04 03:59:27.879266", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:27.977074", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.011449925601482391, "timestamp": "2025-09-04 03:59:27.994541", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:28.099296", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.005783349275588989, "timestamp": "2025-09-04 03:59:28.117980", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 03:59:28.227054", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.017474643886089325, "timestamp": "2025-09-04 03:59:28.248142", "step": 1600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:36.640768", "step": 1600, "epoch": 2 }, { "type": "pplx", "content": 352.57666281747646, "timestamp": "2025-09-04 03:59:36.643336", "step": 1600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1600", "timestamp": "2025-09-04 03:59:37.160974", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:37.258635", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.004377053584903479, "timestamp": "2025-09-04 03:59:37.278760", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:59:37.357328", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.09427284449338913, "timestamp": "2025-09-04 03:59:37.371013", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:37.468118", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.006842142436653376, "timestamp": "2025-09-04 03:59:37.485384", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:37.593709", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.04602169245481491, "timestamp": "2025-09-04 03:59:37.614356", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:37.717365", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.0032124435529112816, "timestamp": "2025-09-04 03:59:37.738360", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:37.850506", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.0026126240845769644, "timestamp": "2025-09-04 03:59:37.870407", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:59:37.962373", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.0179721862077713, "timestamp": "2025-09-04 03:59:37.979015", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:38.075975", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.03628922253847122, "timestamp": "2025-09-04 03:59:38.094017", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:38.196918", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.037857189774513245, "timestamp": "2025-09-04 03:59:38.217694", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:59:38.295044", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.01975720003247261, "timestamp": "2025-09-04 03:59:38.308225", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:59:38.415102", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.049535252153873444, "timestamp": "2025-09-04 03:59:38.433713", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:38.534698", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.01265799067914486, "timestamp": "2025-09-04 03:59:38.554118", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:59:38.638435", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.09150931984186172, "timestamp": "2025-09-04 03:59:38.655552", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:38.754553", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.007259078789502382, "timestamp": "2025-09-04 03:59:38.773143", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 03:59:38.858195", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.02660290338099003, "timestamp": "2025-09-04 03:59:38.873585", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 03:59:38.995688", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.007390860002487898, "timestamp": "2025-09-04 03:59:39.019381", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:39.113144", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.016929682344198227, "timestamp": "2025-09-04 03:59:39.132006", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:39.238588", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.006743302568793297, "timestamp": "2025-09-04 03:59:39.258325", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:39.373616", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.017765972763299942, "timestamp": "2025-09-04 03:59:39.393491", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:59:39.530286", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.023081377148628235, "timestamp": "2025-09-04 03:59:39.556794", "step": 1620, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:47.940129", "step": 1620, "epoch": 2 }, { "type": "pplx", "content": 344.5752767610482, "timestamp": "2025-09-04 03:59:47.944701", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 03:59:48.061987", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.012214818969368935, "timestamp": "2025-09-04 03:59:48.087346", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:48.182192", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.0025064749643206596, "timestamp": "2025-09-04 03:59:48.199612", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:59:48.334327", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.013956296257674694, "timestamp": "2025-09-04 03:59:48.360138", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 03:59:48.459381", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.02604658156633377, "timestamp": "2025-09-04 03:59:48.478589", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:59:48.592076", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.015420682728290558, "timestamp": "2025-09-04 03:59:48.614371", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:59:48.717389", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.02754286862909794, "timestamp": "2025-09-04 03:59:48.736363", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 03:59:48.812596", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.0014424566179513931, "timestamp": "2025-09-04 03:59:48.826120", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:48.920932", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.015860071405768394, "timestamp": "2025-09-04 03:59:48.938622", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 03:59:49.029533", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.022340765222907066, "timestamp": "2025-09-04 03:59:49.048431", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 03:59:49.141575", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.06079462915658951, "timestamp": "2025-09-04 03:59:49.158388", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:49.259259", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.024208059534430504, "timestamp": "2025-09-04 03:59:49.277857", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:59:49.354899", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.01721222698688507, "timestamp": "2025-09-04 03:59:49.369440", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 03:59:49.462980", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.023982705548405647, "timestamp": "2025-09-04 03:59:49.482064", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:59:49.560407", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.04129756614565849, "timestamp": "2025-09-04 03:59:49.574243", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:49.677225", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.010030495934188366, "timestamp": "2025-09-04 03:59:49.696269", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 03:59:49.798566", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.011695167981088161, "timestamp": "2025-09-04 03:59:49.818213", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 03:59:49.906625", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.038352031260728836, "timestamp": "2025-09-04 03:59:49.924748", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 03:59:50.035679", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.04370134696364403, "timestamp": "2025-09-04 03:59:50.056082", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:50.155714", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.0329529233276844, "timestamp": "2025-09-04 03:59:50.174460", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 03:59:50.260151", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.07019810378551483, "timestamp": "2025-09-04 03:59:50.276294", "step": 1640, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 03:59:58.677353", "step": 1640, "epoch": 2 }, { "type": "pplx", "content": 337.12616741483475, "timestamp": "2025-09-04 03:59:58.679743", "step": 1640, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1640", "timestamp": "2025-09-04 03:59:59.042413", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 03:59:59.118171", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.008293285965919495, "timestamp": "2025-09-04 03:59:59.133120", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 03:59:59.207261", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.032241858541965485, "timestamp": "2025-09-04 03:59:59.220518", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 03:59:59.327128", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.022532925009727478, "timestamp": "2025-09-04 03:59:59.347081", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 03:59:59.456331", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.02694687992334366, "timestamp": "2025-09-04 03:59:59.477507", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 03:59:59.574944", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.04047459363937378, "timestamp": "2025-09-04 03:59:59.595569", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 03:59:59.733413", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.029626868665218353, "timestamp": "2025-09-04 03:59:59.759206", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 03:59:59.863484", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.06880754232406616, "timestamp": "2025-09-04 03:59:59.882549", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 03:59:59.955783", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.01954088918864727, "timestamp": "2025-09-04 03:59:59.969438", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:00.065814", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.03680575639009476, "timestamp": "2025-09-04 04:00:00.085936", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:00.190966", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.05359693989157677, "timestamp": "2025-09-04 04:00:00.210087", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:00:00.285825", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.010023823007941246, "timestamp": "2025-09-04 04:00:00.299422", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:00:00.382070", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.012647191993892193, "timestamp": "2025-09-04 04:00:00.397724", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:00.494834", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.024116551503539085, "timestamp": "2025-09-04 04:00:00.515369", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:00:00.602080", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.012986271642148495, "timestamp": "2025-09-04 04:00:00.617510", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:00:00.704538", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.0540911890566349, "timestamp": "2025-09-04 04:00:00.719990", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:00.820816", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.007742506917566061, "timestamp": "2025-09-04 04:00:00.840248", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:00:00.959478", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.010772481560707092, "timestamp": "2025-09-04 04:00:00.984758", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:00:01.065179", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.02090446837246418, "timestamp": "2025-09-04 04:00:01.079075", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:00:01.186014", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.001743293716572225, "timestamp": "2025-09-04 04:00:01.205789", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 04:00:01.334218", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.010634549893438816, "timestamp": "2025-09-04 04:00:01.359030", "step": 1660, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:00:09.817753", "step": 1660, "epoch": 2 }, { "type": "pplx", "content": 331.6593289744211, "timestamp": "2025-09-04 04:00:09.820816", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:00:09.894815", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.02064441703259945, "timestamp": "2025-09-04 04:00:09.909470", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:00:10.012229", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.015596513636410236, "timestamp": "2025-09-04 04:00:10.031405", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:10.132215", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.011088766157627106, "timestamp": "2025-09-04 04:00:10.150840", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:00:10.273945", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.012567834928631783, "timestamp": "2025-09-04 04:00:10.295058", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:00:10.403246", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.06560499221086502, "timestamp": "2025-09-04 04:00:10.425808", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:10.519387", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.012841533869504929, "timestamp": "2025-09-04 04:00:10.536398", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:00:10.650045", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.003418769920244813, "timestamp": "2025-09-04 04:00:10.670606", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:10.766374", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.02538418211042881, "timestamp": "2025-09-04 04:00:10.784387", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:00:10.884595", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.08917974680662155, "timestamp": "2025-09-04 04:00:10.905588", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:11.005904", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.052654486149549484, "timestamp": "2025-09-04 04:00:11.024342", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:11.123525", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.0234109815210104, "timestamp": "2025-09-04 04:00:11.141882", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:00:11.229789", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.03294301778078079, "timestamp": "2025-09-04 04:00:11.245909", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:00:11.315897", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.01805739291012287, "timestamp": "2025-09-04 04:00:11.329916", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:11.434633", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.01714208535850048, "timestamp": "2025-09-04 04:00:11.453840", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 04:00:11.580861", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.04294885694980621, "timestamp": "2025-09-04 04:00:11.605060", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1040 ], "flops": 20800126336064.0 }, "timestamp": "2025-09-04 04:00:11.755808", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.04310276731848717, "timestamp": "2025-09-04 04:00:11.785965", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:00:11.886494", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.03321857005357742, "timestamp": "2025-09-04 04:00:11.907370", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:00:11.986545", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.014885162003338337, "timestamp": "2025-09-04 04:00:12.000644", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:12.102838", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.00889945961534977, "timestamp": "2025-09-04 04:00:12.121999", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:12.226444", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.0010784993646666408, "timestamp": "2025-09-04 04:00:12.243714", "step": 1680, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:00:20.689461", "step": 1680, "epoch": 2 }, { "type": "pplx", "content": 331.5873085585238, "timestamp": "2025-09-04 04:00:20.691749", "step": 1680, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1680", "timestamp": "2025-09-04 04:00:21.056049", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:21.143665", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.007037813309580088, "timestamp": "2025-09-04 04:00:21.161779", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:21.272043", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.006806929595768452, "timestamp": "2025-09-04 04:00:21.292471", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:21.396953", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.0040665543638169765, "timestamp": "2025-09-04 04:00:21.416216", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:21.506686", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.009878264740109444, "timestamp": "2025-09-04 04:00:21.523945", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:21.615259", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.016732510179281235, "timestamp": "2025-09-04 04:00:21.634022", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:00:21.744343", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.0031748716719448566, "timestamp": "2025-09-04 04:00:21.764249", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:21.856813", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.04727555438876152, "timestamp": "2025-09-04 04:00:21.874028", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:00:21.952413", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.011287051253020763, "timestamp": "2025-09-04 04:00:21.967393", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:22.067839", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.012352891266345978, "timestamp": "2025-09-04 04:00:22.088808", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:00:22.166671", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.041657764464616776, "timestamp": "2025-09-04 04:00:22.180689", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:00:22.258826", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.013096681796014309, "timestamp": "2025-09-04 04:00:22.272579", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:00:22.359084", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.010199989192187786, "timestamp": "2025-09-04 04:00:22.375432", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:22.472488", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.009935924783349037, "timestamp": "2025-09-04 04:00:22.492845", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:22.586149", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.0010472126305103302, "timestamp": "2025-09-04 04:00:22.603478", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:22.707303", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.013101182878017426, "timestamp": "2025-09-04 04:00:22.726713", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:00:22.838549", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.00893877912312746, "timestamp": "2025-09-04 04:00:22.859904", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:22.957935", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.012058167718350887, "timestamp": "2025-09-04 04:00:22.978574", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:23.069815", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.03133353590965271, "timestamp": "2025-09-04 04:00:23.086718", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:23.190329", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.004476282745599747, "timestamp": "2025-09-04 04:00:23.209733", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:23.309449", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.025943979620933533, "timestamp": "2025-09-04 04:00:23.328940", "step": 1700, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:00:31.826010", "step": 1700, "epoch": 2 }, { "type": "pplx", "content": 333.02580516691586, "timestamp": "2025-09-04 04:00:31.828237", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:00:31.926110", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.017599618062376976, "timestamp": "2025-09-04 04:00:31.947097", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:32.040262", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.0015532653778791428, "timestamp": "2025-09-04 04:00:32.057510", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:00:32.143074", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.019918687641620636, "timestamp": "2025-09-04 04:00:32.158506", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:00:32.232768", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.005274395924061537, "timestamp": "2025-09-04 04:00:32.246605", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:32.337510", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.01871996931731701, "timestamp": "2025-09-04 04:00:32.356341", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:00:32.441434", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.04257494583725929, "timestamp": "2025-09-04 04:00:32.456936", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:32.552180", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.020532267168164253, "timestamp": "2025-09-04 04:00:32.569560", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:00:32.645022", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.004226659890264273, "timestamp": "2025-09-04 04:00:32.659438", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:32.766042", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.008803565986454487, "timestamp": "2025-09-04 04:00:32.788565", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:00:32.895625", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.02005820721387863, "timestamp": "2025-09-04 04:00:32.915733", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:33.006906", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.1214195266366005, "timestamp": "2025-09-04 04:00:33.023783", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:33.126759", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.045690055936574936, "timestamp": "2025-09-04 04:00:33.144887", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:00:33.259834", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.0035067156422883272, "timestamp": "2025-09-04 04:00:33.284236", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:33.378429", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.012944525107741356, "timestamp": "2025-09-04 04:00:33.395663", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:33.501202", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.0048626684583723545, "timestamp": "2025-09-04 04:00:33.520594", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:00:33.598180", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.01824316196143627, "timestamp": "2025-09-04 04:00:33.613069", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:33.711266", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.008359821513295174, "timestamp": "2025-09-04 04:00:33.732074", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:00:33.810997", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.056332752108573914, "timestamp": "2025-09-04 04:00:33.824856", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:00:33.959830", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.0033087804913520813, "timestamp": "2025-09-04 04:00:33.985444", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:34.087256", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.0023557273671031, "timestamp": "2025-09-04 04:00:34.106749", "step": 1720, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:00:42.599713", "step": 1720, "epoch": 2 }, { "type": "pplx", "content": 333.25088739723225, "timestamp": "2025-09-04 04:00:42.601861", "step": 1720, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1720", "timestamp": "2025-09-04 04:00:42.959913", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:43.064965", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.03474944829940796, "timestamp": "2025-09-04 04:00:43.087465", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:43.198541", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.005826589651405811, "timestamp": "2025-09-04 04:00:43.218959", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:00:43.339501", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.0146811967715621, "timestamp": "2025-09-04 04:00:43.361337", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:00:43.445239", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.027011031284928322, "timestamp": "2025-09-04 04:00:43.461157", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:43.555904", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.009984579868614674, "timestamp": "2025-09-04 04:00:43.574597", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:43.674494", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.03857394680380821, "timestamp": "2025-09-04 04:00:43.693031", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:43.786677", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.0024763226974755526, "timestamp": "2025-09-04 04:00:43.803892", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:00:43.913846", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.02332633174955845, "timestamp": "2025-09-04 04:00:43.935398", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:44.036422", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.06381595879793167, "timestamp": "2025-09-04 04:00:44.057407", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:44.153790", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.0381832979619503, "timestamp": "2025-09-04 04:00:44.171248", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:44.265710", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.027034031227231026, "timestamp": "2025-09-04 04:00:44.283200", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:44.378747", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.022275667637586594, "timestamp": "2025-09-04 04:00:44.397024", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:00:44.502743", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.11091621220111847, "timestamp": "2025-09-04 04:00:44.525023", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:44.636832", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.0019922046922147274, "timestamp": "2025-09-04 04:00:44.657096", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:44.757668", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.001954772276803851, "timestamp": "2025-09-04 04:00:44.776302", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:44.879280", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.0044233831577003, "timestamp": "2025-09-04 04:00:44.898969", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:00:44.981562", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.0020844575483351946, "timestamp": "2025-09-04 04:00:44.998177", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:45.088833", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.021219972521066666, "timestamp": "2025-09-04 04:00:45.105675", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:45.200766", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.0035940525121986866, "timestamp": "2025-09-04 04:00:45.218204", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:45.322209", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.004270992241799831, "timestamp": "2025-09-04 04:00:45.342196", "step": 1740, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:00:53.844925", "step": 1740, "epoch": 2 }, { "type": "pplx", "content": 332.1656256388727, "timestamp": "2025-09-04 04:00:53.848701", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:53.945874", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.011263997294008732, "timestamp": "2025-09-04 04:00:53.966713", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 04:00:54.186951", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.0082445302978158, "timestamp": "2025-09-04 04:00:54.229291", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:54.333877", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.015566142275929451, "timestamp": "2025-09-04 04:00:54.353256", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:54.446192", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.007874183356761932, "timestamp": "2025-09-04 04:00:54.464227", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:00:54.538554", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.03222401440143585, "timestamp": "2025-09-04 04:00:54.553386", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:54.665546", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.005354198161512613, "timestamp": "2025-09-04 04:00:54.686098", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:00:54.773362", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.02252938598394394, "timestamp": "2025-09-04 04:00:54.789032", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:54.880684", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.016836240887641907, "timestamp": "2025-09-04 04:00:54.898361", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:00:55.003472", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.060379642993211746, "timestamp": "2025-09-04 04:00:55.025325", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:00:55.127851", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.00034839441650547087, "timestamp": "2025-09-04 04:00:55.147125", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:00:55.241660", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.006921238731592894, "timestamp": "2025-09-04 04:00:55.259134", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:00:55.355817", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.0024503259919583797, "timestamp": "2025-09-04 04:00:55.373415", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:00:55.474861", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.013867728412151337, "timestamp": "2025-09-04 04:00:55.495862", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:55.593558", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.012754159979522228, "timestamp": "2025-09-04 04:00:55.611128", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:00:55.720755", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.016128217801451683, "timestamp": "2025-09-04 04:00:55.741196", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:00:55.832248", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.002366261789575219, "timestamp": "2025-09-04 04:00:55.849776", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:00:55.931287", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.012999413534998894, "timestamp": "2025-09-04 04:00:55.947967", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:00:56.042427", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.011377224698662758, "timestamp": "2025-09-04 04:00:56.059803", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:00:56.160358", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.028042022138834, "timestamp": "2025-09-04 04:00:56.179270", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:00:56.280616", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.07089690864086151, "timestamp": "2025-09-04 04:00:56.299883", "step": 1760, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:01:04.797965", "step": 1760, "epoch": 2 }, { "type": "pplx", "content": 331.5460997167647, "timestamp": "2025-09-04 04:01:04.799966", "step": 1760, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1760", "timestamp": "2025-09-04 04:01:05.165288", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:01:05.245129", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.005329194013029337, "timestamp": "2025-09-04 04:01:05.261502", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:05.367704", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.0019212173065170646, "timestamp": "2025-09-04 04:01:05.387739", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:05.482905", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.011046413332223892, "timestamp": "2025-09-04 04:01:05.500152", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:01:05.603684", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.02376765012741089, "timestamp": "2025-09-04 04:01:05.623816", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:05.765786", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.02195976860821247, "timestamp": "2025-09-04 04:01:05.786601", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:01:05.872683", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.06642767786979675, "timestamp": "2025-09-04 04:01:05.887982", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:01:06.093441", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.0038195352535694838, "timestamp": "2025-09-04 04:01:06.132739", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:01:06.245766", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.026749003678560257, "timestamp": "2025-09-04 04:01:06.267150", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:01:06.342459", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.032609354704618454, "timestamp": "2025-09-04 04:01:06.357861", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:01:06.472022", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.02017885632812977, "timestamp": "2025-09-04 04:01:06.492669", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:01:06.570069", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.01600290834903717, "timestamp": "2025-09-04 04:01:06.583952", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:01:06.656885", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.0048013306222856045, "timestamp": "2025-09-04 04:01:06.670727", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:01:06.792023", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.011499549262225628, "timestamp": "2025-09-04 04:01:06.817313", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:01:06.894357", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.007216259371489286, "timestamp": "2025-09-04 04:01:06.908347", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 04:01:07.041806", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.040591221302747726, "timestamp": "2025-09-04 04:01:07.066436", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:07.165680", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.030231038108468056, "timestamp": "2025-09-04 04:01:07.185187", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:07.278526", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.04355088993906975, "timestamp": "2025-09-04 04:01:07.297576", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:07.390198", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.010329993441700935, "timestamp": "2025-09-04 04:01:07.407455", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:01:07.494403", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.06678889691829681, "timestamp": "2025-09-04 04:01:07.510066", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:07.603681", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.03957043215632439, "timestamp": "2025-09-04 04:01:07.621544", "step": 1780, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:01:16.109115", "step": 1780, "epoch": 2 }, { "type": "pplx", "content": 332.86792094420156, "timestamp": "2025-09-04 04:01:16.111280", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:01:16.214758", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.02120288833975792, "timestamp": "2025-09-04 04:01:16.237163", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:16.331514", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.07391846925020218, "timestamp": "2025-09-04 04:01:16.348990", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:01:16.454583", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.003579053794965148, "timestamp": "2025-09-04 04:01:16.473828", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:01:16.583998", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.00399737898260355, "timestamp": "2025-09-04 04:01:16.604981", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:01:16.709762", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.06767401099205017, "timestamp": "2025-09-04 04:01:16.732053", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:16.839280", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.015294116921722889, "timestamp": "2025-09-04 04:01:16.859246", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:16.966615", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.056523002684116364, "timestamp": "2025-09-04 04:01:16.986545", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:17.093325", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.031245263293385506, "timestamp": "2025-09-04 04:01:17.114048", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:01:17.189344", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.013243497349321842, "timestamp": "2025-09-04 04:01:17.204416", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:01:17.286843", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.008389891125261784, "timestamp": "2025-09-04 04:01:17.302064", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:01:17.396893", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.07410303503274918, "timestamp": "2025-09-04 04:01:17.414517", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:01:17.492341", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.04840013012290001, "timestamp": "2025-09-04 04:01:17.507244", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:01:17.596637", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.033458348363637924, "timestamp": "2025-09-04 04:01:17.615173", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:01:17.700214", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.0031342965085059404, "timestamp": "2025-09-04 04:01:17.715288", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:01:17.823263", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.004811432678252459, "timestamp": "2025-09-04 04:01:17.843383", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:17.943652", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.060861192643642426, "timestamp": "2025-09-04 04:01:17.963035", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:01:18.038096", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.0050119198858737946, "timestamp": "2025-09-04 04:01:18.053478", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:01:18.130769", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.03676461800932884, "timestamp": "2025-09-04 04:01:18.144495", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:01:18.220106", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.014785559847950935, "timestamp": "2025-09-04 04:01:18.233902", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:01:18.317658", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.008279402740299702, "timestamp": "2025-09-04 04:01:18.333743", "step": 1800, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:01:26.804956", "step": 1800, "epoch": 2 }, { "type": "pplx", "content": 334.6393289394288, "timestamp": "2025-09-04 04:01:26.807267", "step": 1800, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1800", "timestamp": "2025-09-04 04:01:27.163015", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:01:27.279145", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.031706344336271286, "timestamp": "2025-09-04 04:01:27.302897", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:01:27.406870", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.010657178238034248, "timestamp": "2025-09-04 04:01:27.426238", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:27.524874", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.006731742527335882, "timestamp": "2025-09-04 04:01:27.543242", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:01:27.654231", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.010640786960721016, "timestamp": "2025-09-04 04:01:27.675378", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:27.767989", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.0017573751974850893, "timestamp": "2025-09-04 04:01:27.786871", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:01:27.861394", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.0030143826734274626, "timestamp": "2025-09-04 04:01:27.874925", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:01:27.988082", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.05524804815649986, "timestamp": "2025-09-04 04:01:28.008638", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:01:28.111875", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.0070555852726101875, "timestamp": "2025-09-04 04:01:28.131855", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:28.237339", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.0052083320915699005, "timestamp": "2025-09-04 04:01:28.259221", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:01:28.335457", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.006322692148387432, "timestamp": "2025-09-04 04:01:28.348911", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:28.444431", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.0009704896947368979, "timestamp": "2025-09-04 04:01:28.461713", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:28.563516", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.032470621168613434, "timestamp": "2025-09-04 04:01:28.582668", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:28.690265", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.09668878465890884, "timestamp": "2025-09-04 04:01:28.711902", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:01:28.825233", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.043325960636138916, "timestamp": "2025-09-04 04:01:28.845811", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:01:28.919550", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.03765127435326576, "timestamp": "2025-09-04 04:01:28.932457", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:29.027274", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.0037220031954348087, "timestamp": "2025-09-04 04:01:29.045400", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:29.138382", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.010741024278104305, "timestamp": "2025-09-04 04:01:29.157429", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:01:29.260861", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.008968895301222801, "timestamp": "2025-09-04 04:01:29.279763", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:29.373806", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.044184453785419464, "timestamp": "2025-09-04 04:01:29.390644", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:01:29.487551", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.000747227284591645, "timestamp": "2025-09-04 04:01:29.505764", "step": 1820, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:01:37.982577", "step": 1820, "epoch": 2 }, { "type": "pplx", "content": 334.53241911357134, "timestamp": "2025-09-04 04:01:37.984505", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:01:38.089414", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.0014711732510477304, "timestamp": "2025-09-04 04:01:38.112040", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:38.205609", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.010439584963023663, "timestamp": "2025-09-04 04:01:38.222944", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:38.323038", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.012518877163529396, "timestamp": "2025-09-04 04:01:38.341806", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:01:38.444098", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.04167770966887474, "timestamp": "2025-09-04 04:01:38.463964", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:38.555702", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.005846615415066481, "timestamp": "2025-09-04 04:01:38.574612", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:38.675901", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.026406852528452873, "timestamp": "2025-09-04 04:01:38.694683", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 04:01:38.758694", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.015116888098418713, "timestamp": "2025-09-04 04:01:38.769907", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:01:38.887154", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.09362763166427612, "timestamp": "2025-09-04 04:01:38.909974", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:01:38.999028", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.000753458240069449, "timestamp": "2025-09-04 04:01:39.017403", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:01:39.097095", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.021402668207883835, "timestamp": "2025-09-04 04:01:39.111346", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:39.213078", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.005958658177405596, "timestamp": "2025-09-04 04:01:39.231929", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:39.333890", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.051472008228302, "timestamp": "2025-09-04 04:01:39.353219", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:01:39.437888", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.030274610966444016, "timestamp": "2025-09-04 04:01:39.454964", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:01:39.562661", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.007288443390280008, "timestamp": "2025-09-04 04:01:39.582323", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:39.676731", "step": 1834, "epoch": 2 }, { "type": "loss", "content": 0.006486637983471155, "timestamp": "2025-09-04 04:01:39.693979", "step": 1835, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:39.793975", "step": 1835, "epoch": 2 }, { "type": "loss", "content": 0.0005186740309000015, "timestamp": "2025-09-04 04:01:39.813264", "step": 1836, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:39.904684", "step": 1836, "epoch": 2 }, { "type": "loss", "content": 0.004083861596882343, "timestamp": "2025-09-04 04:01:39.923216", "step": 1837, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:01:40.025797", "step": 1837, "epoch": 2 }, { "type": "loss", "content": 0.018782733008265495, "timestamp": "2025-09-04 04:01:40.045011", "step": 1838, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:01:40.121633", "step": 1838, "epoch": 2 }, { "type": "loss", "content": 0.004725904669612646, "timestamp": "2025-09-04 04:01:40.135403", "step": 1839, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:01:40.226335", "step": 1839, "epoch": 2 }, { "type": "loss", "content": 0.027214346453547478, "timestamp": "2025-09-04 04:01:40.243999", "step": 1840, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:01:48.732766", "step": 1840, "epoch": 2 }, { "type": "pplx", "content": 332.6223443151369, "timestamp": "2025-09-04 04:01:48.735049", "step": 1840, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1840", "timestamp": "2025-09-04 04:01:49.155872", "step": 1840, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:01:49.243365", "step": 1840, "epoch": 2 }, { "type": "loss", "content": 0.00866016000509262, "timestamp": "2025-09-04 04:01:49.260285", "step": 1841, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:01:49.383488", "step": 1841, "epoch": 2 }, { "type": "loss", "content": 0.02382933534681797, "timestamp": "2025-09-04 04:01:49.406702", "step": 1842, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:49.501505", "step": 1842, "epoch": 2 }, { "type": "loss", "content": 0.018402768298983574, "timestamp": "2025-09-04 04:01:49.518813", "step": 1843, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:01:49.615531", "step": 1843, "epoch": 2 }, { "type": "loss", "content": 0.0033138843718916178, "timestamp": "2025-09-04 04:01:49.633725", "step": 1844, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:01:49.765165", "step": 1844, "epoch": 2 }, { "type": "loss", "content": 0.014491533860564232, "timestamp": "2025-09-04 04:01:49.793552", "step": 1845, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:49.893752", "step": 1845, "epoch": 2 }, { "type": "loss", "content": 0.06931175291538239, "timestamp": "2025-09-04 04:01:49.912401", "step": 1846, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:50.006776", "step": 1846, "epoch": 2 }, { "type": "loss", "content": 0.04427943378686905, "timestamp": "2025-09-04 04:01:50.023915", "step": 1847, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:01:50.132323", "step": 1847, "epoch": 2 }, { "type": "loss", "content": 0.012512077577412128, "timestamp": "2025-09-04 04:01:50.153223", "step": 1848, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:50.245240", "step": 1848, "epoch": 2 }, { "type": "loss", "content": 0.009834478609263897, "timestamp": "2025-09-04 04:01:50.263906", "step": 1849, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:01:50.369649", "step": 1849, "epoch": 2 }, { "type": "loss", "content": 0.07198784500360489, "timestamp": "2025-09-04 04:01:50.388693", "step": 1850, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:01:50.499515", "step": 1850, "epoch": 2 }, { "type": "loss", "content": 0.031000154092907906, "timestamp": "2025-09-04 04:01:50.519893", "step": 1851, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:01:50.612703", "step": 1851, "epoch": 2 }, { "type": "loss", "content": 0.01771543361246586, "timestamp": "2025-09-04 04:01:50.630038", "step": 1852, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:01:50.751707", "step": 1852, "epoch": 2 }, { "type": "loss", "content": 0.009000863879919052, "timestamp": "2025-09-04 04:01:50.777057", "step": 1853, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:50.871081", "step": 1853, "epoch": 2 }, { "type": "loss", "content": 0.017614608630537987, "timestamp": "2025-09-04 04:01:50.888216", "step": 1854, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:01:50.989658", "step": 1854, "epoch": 2 }, { "type": "loss", "content": 0.0047741541638970375, "timestamp": "2025-09-04 04:01:51.008530", "step": 1855, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:01:51.099179", "step": 1855, "epoch": 2 }, { "type": "loss", "content": 0.029526762664318085, "timestamp": "2025-09-04 04:01:51.116803", "step": 1856, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:01:51.208159", "step": 1856, "epoch": 2 }, { "type": "loss", "content": 0.01259857602417469, "timestamp": "2025-09-04 04:01:51.227097", "step": 1857, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:01:51.305343", "step": 1857, "epoch": 2 }, { "type": "loss", "content": 0.02670917473733425, "timestamp": "2025-09-04 04:01:51.318941", "step": 1858, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:01:51.419359", "step": 1858, "epoch": 2 }, { "type": "loss", "content": 0.0215139277279377, "timestamp": "2025-09-04 04:01:51.437922", "step": 1859, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:01:51.523670", "step": 1859, "epoch": 2 }, { "type": "loss", "content": 0.013091914355754852, "timestamp": "2025-09-04 04:01:51.540128", "step": 1860, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:00.150071", "step": 1860, "epoch": 2 }, { "type": "pplx", "content": 328.99443454458276, "timestamp": "2025-09-04 04:02:00.152296", "step": 1860, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:02:00.234249", "step": 1860, "epoch": 2 }, { "type": "loss", "content": 0.021875588223338127, "timestamp": "2025-09-04 04:02:00.250938", "step": 1861, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:00.341251", "step": 1861, "epoch": 2 }, { "type": "loss", "content": 0.0015288630966097116, "timestamp": "2025-09-04 04:02:00.358039", "step": 1862, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:00.458610", "step": 1862, "epoch": 2 }, { "type": "loss", "content": 0.03855356201529503, "timestamp": "2025-09-04 04:02:00.477318", "step": 1863, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:02:00.588810", "step": 1863, "epoch": 2 }, { "type": "loss", "content": 0.015515622682869434, "timestamp": "2025-09-04 04:02:00.610288", "step": 1864, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:00.699121", "step": 1864, "epoch": 2 }, { "type": "loss", "content": 0.020159801468253136, "timestamp": "2025-09-04 04:02:00.717609", "step": 1865, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:02:00.826565", "step": 1865, "epoch": 2 }, { "type": "loss", "content": 0.0021761921234428883, "timestamp": "2025-09-04 04:02:00.846815", "step": 1866, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:00.957794", "step": 1866, "epoch": 2 }, { "type": "loss", "content": 0.022660445421934128, "timestamp": "2025-09-04 04:02:00.978402", "step": 1867, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:02:01.062716", "step": 1867, "epoch": 2 }, { "type": "loss", "content": 0.004487188998609781, "timestamp": "2025-09-04 04:02:01.078811", "step": 1868, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:01.169919", "step": 1868, "epoch": 2 }, { "type": "loss", "content": 0.034519318491220474, "timestamp": "2025-09-04 04:02:01.189116", "step": 1869, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:01.300376", "step": 1869, "epoch": 2 }, { "type": "loss", "content": 0.0377497524023056, "timestamp": "2025-09-04 04:02:01.320997", "step": 1870, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:01.415723", "step": 1870, "epoch": 2 }, { "type": "loss", "content": 0.015238355845212936, "timestamp": "2025-09-04 04:02:01.433073", "step": 1871, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:01.535777", "step": 1871, "epoch": 2 }, { "type": "loss", "content": 0.012231401167809963, "timestamp": "2025-09-04 04:02:01.555722", "step": 1872, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:02:01.663789", "step": 1872, "epoch": 2 }, { "type": "loss", "content": 0.003201687941327691, "timestamp": "2025-09-04 04:02:01.686618", "step": 1873, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:01.791786", "step": 1873, "epoch": 2 }, { "type": "loss", "content": 0.024663355201482773, "timestamp": "2025-09-04 04:02:01.811100", "step": 1874, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:02:01.904041", "step": 1874, "epoch": 2 }, { "type": "loss", "content": 0.07429840415716171, "timestamp": "2025-09-04 04:02:01.921296", "step": 1875, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:02.021345", "step": 1875, "epoch": 2 }, { "type": "loss", "content": 0.0015133678680285811, "timestamp": "2025-09-04 04:02:02.040982", "step": 1876, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:02.144440", "step": 1876, "epoch": 2 }, { "type": "loss", "content": 0.09027554839849472, "timestamp": "2025-09-04 04:02:02.165531", "step": 1877, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:02.265048", "step": 1877, "epoch": 2 }, { "type": "loss", "content": 0.029700566083192825, "timestamp": "2025-09-04 04:02:02.283427", "step": 1878, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:02:02.355323", "step": 1878, "epoch": 2 }, { "type": "loss", "content": 0.015838583931326866, "timestamp": "2025-09-04 04:02:02.368341", "step": 1879, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:02.463178", "step": 1879, "epoch": 2 }, { "type": "loss", "content": 0.0317532904446125, "timestamp": "2025-09-04 04:02:02.481273", "step": 1880, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:10.983366", "step": 1880, "epoch": 2 }, { "type": "pplx", "content": 326.9121941560908, "timestamp": "2025-09-04 04:02:10.985583", "step": 1880, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1880", "timestamp": "2025-09-04 04:02:11.488696", "step": 1880, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:11.586020", "step": 1880, "epoch": 2 }, { "type": "loss", "content": 0.006327355746179819, "timestamp": "2025-09-04 04:02:11.606788", "step": 1881, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:02:11.679312", "step": 1881, "epoch": 2 }, { "type": "loss", "content": 0.0010608435841277242, "timestamp": "2025-09-04 04:02:11.692168", "step": 1882, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:11.787450", "step": 1882, "epoch": 2 }, { "type": "loss", "content": 0.012697882950305939, "timestamp": "2025-09-04 04:02:11.804854", "step": 1883, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:11.912598", "step": 1883, "epoch": 2 }, { "type": "loss", "content": 0.01790587045252323, "timestamp": "2025-09-04 04:02:11.933353", "step": 1884, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:12.035106", "step": 1884, "epoch": 2 }, { "type": "loss", "content": 0.0027732134331017733, "timestamp": "2025-09-04 04:02:12.056040", "step": 1885, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:02:12.134830", "step": 1885, "epoch": 2 }, { "type": "loss", "content": 0.063087597489357, "timestamp": "2025-09-04 04:02:12.148978", "step": 1886, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:12.256224", "step": 1886, "epoch": 2 }, { "type": "loss", "content": 0.0023315551225095987, "timestamp": "2025-09-04 04:02:12.276242", "step": 1887, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:12.383110", "step": 1887, "epoch": 2 }, { "type": "loss", "content": 0.011332008987665176, "timestamp": "2025-09-04 04:02:12.403326", "step": 1888, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:12.511780", "step": 1888, "epoch": 2 }, { "type": "loss", "content": 0.04571153596043587, "timestamp": "2025-09-04 04:02:12.534517", "step": 1889, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:02:12.617993", "step": 1889, "epoch": 2 }, { "type": "loss", "content": 0.05394493415951729, "timestamp": "2025-09-04 04:02:12.633177", "step": 1890, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:02:12.716784", "step": 1890, "epoch": 2 }, { "type": "loss", "content": 0.007151946425437927, "timestamp": "2025-09-04 04:02:12.732026", "step": 1891, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:12.832763", "step": 1891, "epoch": 2 }, { "type": "loss", "content": 0.03096769005060196, "timestamp": "2025-09-04 04:02:12.852161", "step": 1892, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:12.953361", "step": 1892, "epoch": 2 }, { "type": "loss", "content": 0.019417183473706245, "timestamp": "2025-09-04 04:02:12.974116", "step": 1893, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:02:13.084289", "step": 1893, "epoch": 2 }, { "type": "loss", "content": 0.007123048882931471, "timestamp": "2025-09-04 04:02:13.104493", "step": 1894, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:13.216318", "step": 1894, "epoch": 2 }, { "type": "loss", "content": 0.00961579754948616, "timestamp": "2025-09-04 04:02:13.236746", "step": 1895, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:13.339570", "step": 1895, "epoch": 2 }, { "type": "loss", "content": 0.01128938514739275, "timestamp": "2025-09-04 04:02:13.359169", "step": 1896, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:13.451654", "step": 1896, "epoch": 2 }, { "type": "loss", "content": 0.036590978503227234, "timestamp": "2025-09-04 04:02:13.470687", "step": 1897, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:13.564383", "step": 1897, "epoch": 2 }, { "type": "loss", "content": 0.0244086105376482, "timestamp": "2025-09-04 04:02:13.581880", "step": 1898, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:13.683164", "step": 1898, "epoch": 2 }, { "type": "loss", "content": 0.014544663019478321, "timestamp": "2025-09-04 04:02:13.701976", "step": 1899, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:02:13.780322", "step": 1899, "epoch": 2 }, { "type": "loss", "content": 0.023842498660087585, "timestamp": "2025-09-04 04:02:13.795174", "step": 1900, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:22.378146", "step": 1900, "epoch": 2 }, { "type": "pplx", "content": 327.30657016075753, "timestamp": "2025-09-04 04:02:22.380281", "step": 1900, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:02:22.453274", "step": 1900, "epoch": 2 }, { "type": "loss", "content": 0.036261361092329025, "timestamp": "2025-09-04 04:02:22.467795", "step": 1901, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:22.571889", "step": 1901, "epoch": 2 }, { "type": "loss", "content": 0.0038794318679720163, "timestamp": "2025-09-04 04:02:22.590900", "step": 1902, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:22.697502", "step": 1902, "epoch": 2 }, { "type": "loss", "content": 0.00663451012223959, "timestamp": "2025-09-04 04:02:22.715533", "step": 1903, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:22.813576", "step": 1903, "epoch": 2 }, { "type": "loss", "content": 0.0043860250152647495, "timestamp": "2025-09-04 04:02:22.831587", "step": 1904, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:22.934235", "step": 1904, "epoch": 2 }, { "type": "loss", "content": 0.04079779237508774, "timestamp": "2025-09-04 04:02:22.955046", "step": 1905, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:23.051479", "step": 1905, "epoch": 2 }, { "type": "loss", "content": 0.011518475599586964, "timestamp": "2025-09-04 04:02:23.068728", "step": 1906, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:02:23.156438", "step": 1906, "epoch": 2 }, { "type": "loss", "content": 0.07115134596824646, "timestamp": "2025-09-04 04:02:23.171339", "step": 1907, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:23.267997", "step": 1907, "epoch": 2 }, { "type": "loss", "content": 0.006972004193812609, "timestamp": "2025-09-04 04:02:23.286112", "step": 1908, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:02:23.405084", "step": 1908, "epoch": 2 }, { "type": "loss", "content": 0.02820487506687641, "timestamp": "2025-09-04 04:02:23.428196", "step": 1909, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:23.533819", "step": 1909, "epoch": 2 }, { "type": "loss", "content": 0.021369347348809242, "timestamp": "2025-09-04 04:02:23.552962", "step": 1910, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:23.655938", "step": 1910, "epoch": 2 }, { "type": "loss", "content": 0.031066900119185448, "timestamp": "2025-09-04 04:02:23.673941", "step": 1911, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:02:23.753399", "step": 1911, "epoch": 2 }, { "type": "loss", "content": 0.007914647459983826, "timestamp": "2025-09-04 04:02:23.767444", "step": 1912, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:02:23.850430", "step": 1912, "epoch": 2 }, { "type": "loss", "content": 0.009337898343801498, "timestamp": "2025-09-04 04:02:23.866235", "step": 1913, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:02:23.951961", "step": 1913, "epoch": 2 }, { "type": "loss", "content": 0.015139404684305191, "timestamp": "2025-09-04 04:02:23.966375", "step": 1914, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:24.072580", "step": 1914, "epoch": 2 }, { "type": "loss", "content": 0.020058702677488327, "timestamp": "2025-09-04 04:02:24.091184", "step": 1915, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:24.184250", "step": 1915, "epoch": 2 }, { "type": "loss", "content": 0.01975717768073082, "timestamp": "2025-09-04 04:02:24.201038", "step": 1916, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:24.301071", "step": 1916, "epoch": 2 }, { "type": "loss", "content": 0.03391404077410698, "timestamp": "2025-09-04 04:02:24.321252", "step": 1917, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:02:24.447433", "step": 1917, "epoch": 2 }, { "type": "loss", "content": 0.02063564583659172, "timestamp": "2025-09-04 04:02:24.470101", "step": 1918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:24.576203", "step": 1918, "epoch": 2 }, { "type": "loss", "content": 0.0002946726162917912, "timestamp": "2025-09-04 04:02:24.595051", "step": 1919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:02:24.719215", "step": 1919, "epoch": 2 }, { "type": "loss", "content": 0.005154923070222139, "timestamp": "2025-09-04 04:02:24.742337", "step": 1920, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:33.304746", "step": 1920, "epoch": 2 }, { "type": "pplx", "content": 328.53727900122976, "timestamp": "2025-09-04 04:02:33.308168", "step": 1920, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1920", "timestamp": "2025-09-04 04:02:33.808373", "step": 1920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:02:33.892978", "step": 1920, "epoch": 2 }, { "type": "loss", "content": 0.049568939954042435, "timestamp": "2025-09-04 04:02:33.910033", "step": 1921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:34.011859", "step": 1921, "epoch": 2 }, { "type": "loss", "content": 0.03862825781106949, "timestamp": "2025-09-04 04:02:34.030696", "step": 1922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:02:34.108930", "step": 1922, "epoch": 2 }, { "type": "loss", "content": 0.003806066932156682, "timestamp": "2025-09-04 04:02:34.123154", "step": 1923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:02:34.216966", "step": 1923, "epoch": 2 }, { "type": "loss", "content": 0.020454682409763336, "timestamp": "2025-09-04 04:02:34.234930", "step": 1924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:34.350041", "step": 1924, "epoch": 2 }, { "type": "loss", "content": 0.056430667638778687, "timestamp": "2025-09-04 04:02:34.369848", "step": 1925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:34.501762", "step": 1925, "epoch": 2 }, { "type": "loss", "content": 0.007279661018401384, "timestamp": "2025-09-04 04:02:34.520863", "step": 1926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:34.658902", "step": 1926, "epoch": 2 }, { "type": "loss", "content": 0.011289707385003567, "timestamp": "2025-09-04 04:02:34.676419", "step": 1927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:34.772220", "step": 1927, "epoch": 2 }, { "type": "loss", "content": 0.0045151012018322945, "timestamp": "2025-09-04 04:02:34.790651", "step": 1928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:34.879643", "step": 1928, "epoch": 2 }, { "type": "loss", "content": 0.016711879521608353, "timestamp": "2025-09-04 04:02:34.897944", "step": 1929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:02:34.977184", "step": 1929, "epoch": 2 }, { "type": "loss", "content": 0.011652039363980293, "timestamp": "2025-09-04 04:02:34.991213", "step": 1930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:35.084798", "step": 1930, "epoch": 2 }, { "type": "loss", "content": 0.00244183954782784, "timestamp": "2025-09-04 04:02:35.102275", "step": 1931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:02:35.181891", "step": 1931, "epoch": 2 }, { "type": "loss", "content": 0.004456162918359041, "timestamp": "2025-09-04 04:02:35.196813", "step": 1932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:02:35.294174", "step": 1932, "epoch": 2 }, { "type": "loss", "content": 0.018451880663633347, "timestamp": "2025-09-04 04:02:35.313037", "step": 1933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:35.414679", "step": 1933, "epoch": 2 }, { "type": "loss", "content": 0.011028347536921501, "timestamp": "2025-09-04 04:02:35.434011", "step": 1934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:02:35.571788", "step": 1934, "epoch": 2 }, { "type": "loss", "content": 0.0028660639654845, "timestamp": "2025-09-04 04:02:35.598124", "step": 1935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:35.698343", "step": 1935, "epoch": 2 }, { "type": "loss", "content": 0.03565828502178192, "timestamp": "2025-09-04 04:02:35.718091", "step": 1936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:35.818676", "step": 1936, "epoch": 2 }, { "type": "loss", "content": 0.10143911838531494, "timestamp": "2025-09-04 04:02:35.839976", "step": 1937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:35.940531", "step": 1937, "epoch": 2 }, { "type": "loss", "content": 0.01908188872039318, "timestamp": "2025-09-04 04:02:35.959130", "step": 1938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:36.063139", "step": 1938, "epoch": 2 }, { "type": "loss", "content": 0.018672922626137733, "timestamp": "2025-09-04 04:02:36.082502", "step": 1939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:36.186225", "step": 1939, "epoch": 2 }, { "type": "loss", "content": 0.002945462241768837, "timestamp": "2025-09-04 04:02:36.206294", "step": 1940, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:44.692002", "step": 1940, "epoch": 2 }, { "type": "pplx", "content": 328.19710857649045, "timestamp": "2025-09-04 04:02:44.693795", "step": 1940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:02:44.760706", "step": 1940, "epoch": 2 }, { "type": "loss", "content": 0.006576020736247301, "timestamp": "2025-09-04 04:02:44.774394", "step": 1941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:02:44.852530", "step": 1941, "epoch": 2 }, { "type": "loss", "content": 0.0037395476829260588, "timestamp": "2025-09-04 04:02:44.866793", "step": 1942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:44.966761", "step": 1942, "epoch": 2 }, { "type": "loss", "content": 0.008964164182543755, "timestamp": "2025-09-04 04:02:44.985466", "step": 1943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:45.085746", "step": 1943, "epoch": 2 }, { "type": "loss", "content": 0.010037235915660858, "timestamp": "2025-09-04 04:02:45.105491", "step": 1944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:45.204877", "step": 1944, "epoch": 2 }, { "type": "loss", "content": 0.03502393513917923, "timestamp": "2025-09-04 04:02:45.226011", "step": 1945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:45.335742", "step": 1945, "epoch": 2 }, { "type": "loss", "content": 0.007905172184109688, "timestamp": "2025-09-04 04:02:45.356347", "step": 1946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:45.452352", "step": 1946, "epoch": 2 }, { "type": "loss", "content": 0.023761700838804245, "timestamp": "2025-09-04 04:02:45.470038", "step": 1947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:02:45.579975", "step": 1947, "epoch": 2 }, { "type": "loss", "content": 0.0012213548179715872, "timestamp": "2025-09-04 04:02:45.601147", "step": 1948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:45.693335", "step": 1948, "epoch": 2 }, { "type": "loss", "content": 0.0033678747713565826, "timestamp": "2025-09-04 04:02:45.712436", "step": 1949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:45.807324", "step": 1949, "epoch": 2 }, { "type": "loss", "content": 0.0012253863969817758, "timestamp": "2025-09-04 04:02:45.824862", "step": 1950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:45.933914", "step": 1950, "epoch": 2 }, { "type": "loss", "content": 0.004889811389148235, "timestamp": "2025-09-04 04:02:45.954376", "step": 1951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:46.055612", "step": 1951, "epoch": 2 }, { "type": "loss", "content": 0.025324570015072823, "timestamp": "2025-09-04 04:02:46.074986", "step": 1952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:46.167876", "step": 1952, "epoch": 2 }, { "type": "loss", "content": 0.006361103150993586, "timestamp": "2025-09-04 04:02:46.186888", "step": 1953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:02:46.288522", "step": 1953, "epoch": 2 }, { "type": "loss", "content": 0.019413141533732414, "timestamp": "2025-09-04 04:02:46.307129", "step": 1954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:46.409295", "step": 1954, "epoch": 2 }, { "type": "loss", "content": 0.005714345257729292, "timestamp": "2025-09-04 04:02:46.426613", "step": 1955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:02:46.513059", "step": 1955, "epoch": 2 }, { "type": "loss", "content": 0.056890930980443954, "timestamp": "2025-09-04 04:02:46.529361", "step": 1956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:02:46.603085", "step": 1956, "epoch": 2 }, { "type": "loss", "content": 0.049041591584682465, "timestamp": "2025-09-04 04:02:46.618188", "step": 1957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:02:46.728342", "step": 1957, "epoch": 2 }, { "type": "loss", "content": 0.021011320874094963, "timestamp": "2025-09-04 04:02:46.748819", "step": 1958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:46.843211", "step": 1958, "epoch": 2 }, { "type": "loss", "content": 0.004129297100007534, "timestamp": "2025-09-04 04:02:46.860665", "step": 1959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:46.967025", "step": 1959, "epoch": 2 }, { "type": "loss", "content": 0.0063214851543307304, "timestamp": "2025-09-04 04:02:46.987480", "step": 1960, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:02:55.500629", "step": 1960, "epoch": 2 }, { "type": "pplx", "content": 328.783539244894, "timestamp": "2025-09-04 04:02:55.502583", "step": 1960, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1960", "timestamp": "2025-09-04 04:02:56.015154", "step": 1960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:56.116017", "step": 1960, "epoch": 2 }, { "type": "loss", "content": 0.02819206565618515, "timestamp": "2025-09-04 04:02:56.137065", "step": 1961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:02:56.227546", "step": 1961, "epoch": 2 }, { "type": "loss", "content": 0.0029589931946247816, "timestamp": "2025-09-04 04:02:56.244287", "step": 1962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:56.343431", "step": 1962, "epoch": 2 }, { "type": "loss", "content": 0.01074980664998293, "timestamp": "2025-09-04 04:02:56.362178", "step": 1963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:02:56.461841", "step": 1963, "epoch": 2 }, { "type": "loss", "content": 0.010816739872097969, "timestamp": "2025-09-04 04:02:56.481319", "step": 1964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:02:56.585680", "step": 1964, "epoch": 2 }, { "type": "loss", "content": 0.008484927006065845, "timestamp": "2025-09-04 04:02:56.607894", "step": 1965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:56.704314", "step": 1965, "epoch": 2 }, { "type": "loss", "content": 0.018219899386167526, "timestamp": "2025-09-04 04:02:56.721834", "step": 1966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:02:56.815874", "step": 1966, "epoch": 2 }, { "type": "loss", "content": 0.011172082275152206, "timestamp": "2025-09-04 04:02:56.833246", "step": 1967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:02:56.941573", "step": 1967, "epoch": 2 }, { "type": "loss", "content": 0.015450472943484783, "timestamp": "2025-09-04 04:02:56.962671", "step": 1968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:57.066734", "step": 1968, "epoch": 2 }, { "type": "loss", "content": 0.036945831030607224, "timestamp": "2025-09-04 04:02:57.088685", "step": 1969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:02:57.160350", "step": 1969, "epoch": 2 }, { "type": "loss", "content": 0.0042557851411402225, "timestamp": "2025-09-04 04:02:57.173388", "step": 1970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:02:57.258959", "step": 1970, "epoch": 2 }, { "type": "loss", "content": 0.019967155531048775, "timestamp": "2025-09-04 04:02:57.274551", "step": 1971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:57.381802", "step": 1971, "epoch": 2 }, { "type": "loss", "content": 0.012040197849273682, "timestamp": "2025-09-04 04:02:57.402724", "step": 1972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:02:57.493730", "step": 1972, "epoch": 2 }, { "type": "loss", "content": 0.01980067417025566, "timestamp": "2025-09-04 04:02:57.512481", "step": 1973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:02:57.605693", "step": 1973, "epoch": 2 }, { "type": "loss", "content": 0.008740060031414032, "timestamp": "2025-09-04 04:02:57.622984", "step": 1974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:02:57.727696", "step": 1974, "epoch": 2 }, { "type": "loss", "content": 0.005561890080571175, "timestamp": "2025-09-04 04:02:57.747152", "step": 1975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:02:57.843200", "step": 1975, "epoch": 2 }, { "type": "loss", "content": 0.026673782616853714, "timestamp": "2025-09-04 04:02:57.861480", "step": 1976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:02:57.964421", "step": 1976, "epoch": 2 }, { "type": "loss", "content": 0.06876686215400696, "timestamp": "2025-09-04 04:02:57.986466", "step": 1977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 04:02:58.148841", "step": 1977, "epoch": 2 }, { "type": "loss", "content": 0.005590865388512611, "timestamp": "2025-09-04 04:02:58.180782", "step": 1978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:02:58.264369", "step": 1978, "epoch": 2 }, { "type": "loss", "content": 0.004839139059185982, "timestamp": "2025-09-04 04:02:58.279672", "step": 1979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:02:58.383284", "step": 1979, "epoch": 2 }, { "type": "loss", "content": 0.02411733940243721, "timestamp": "2025-09-04 04:02:58.403072", "step": 1980, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:03:06.910649", "step": 1980, "epoch": 2 }, { "type": "pplx", "content": 331.13313135590505, "timestamp": "2025-09-04 04:03:06.913387", "step": 1980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:03:07.009201", "step": 1980, "epoch": 2 }, { "type": "loss", "content": 0.039461344480514526, "timestamp": "2025-09-04 04:03:07.029715", "step": 1981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:07.132787", "step": 1981, "epoch": 2 }, { "type": "loss", "content": 0.013207260519266129, "timestamp": "2025-09-04 04:03:07.151959", "step": 1982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:07.256472", "step": 1982, "epoch": 2 }, { "type": "loss", "content": 0.029344527050852776, "timestamp": "2025-09-04 04:03:07.275850", "step": 1983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:07.380127", "step": 1983, "epoch": 2 }, { "type": "loss", "content": 0.0013410469982773066, "timestamp": "2025-09-04 04:03:07.400223", "step": 1984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:03:07.491158", "step": 1984, "epoch": 2 }, { "type": "loss", "content": 0.01028304360806942, "timestamp": "2025-09-04 04:03:07.510046", "step": 1985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:03:07.627621", "step": 1985, "epoch": 2 }, { "type": "loss", "content": 0.023101340979337692, "timestamp": "2025-09-04 04:03:07.649726", "step": 1986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:07.741169", "step": 1986, "epoch": 2 }, { "type": "loss", "content": 0.012508483603596687, "timestamp": "2025-09-04 04:03:07.757857", "step": 1987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:03:07.857215", "step": 1987, "epoch": 2 }, { "type": "loss", "content": 0.00880645215511322, "timestamp": "2025-09-04 04:03:07.875535", "step": 1988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:03:07.992983", "step": 1988, "epoch": 2 }, { "type": "loss", "content": 0.00218992680311203, "timestamp": "2025-09-04 04:03:08.016938", "step": 1989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:08.124746", "step": 1989, "epoch": 2 }, { "type": "loss", "content": 0.05591468885540962, "timestamp": "2025-09-04 04:03:08.144878", "step": 1990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:08.223275", "step": 1990, "epoch": 2 }, { "type": "loss", "content": 0.011654703877866268, "timestamp": "2025-09-04 04:03:08.237515", "step": 1991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:03:08.320578", "step": 1991, "epoch": 2 }, { "type": "loss", "content": 0.012211965397000313, "timestamp": "2025-09-04 04:03:08.336518", "step": 1992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:03:08.418294", "step": 1992, "epoch": 2 }, { "type": "loss", "content": 0.006080134306102991, "timestamp": "2025-09-04 04:03:08.435040", "step": 1993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:08.541873", "step": 1993, "epoch": 2 }, { "type": "loss", "content": 0.0031754986848682165, "timestamp": "2025-09-04 04:03:08.561950", "step": 1994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:03:08.697575", "step": 1994, "epoch": 2 }, { "type": "loss", "content": 0.022798430174589157, "timestamp": "2025-09-04 04:03:08.723260", "step": 1995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:03:08.823733", "step": 1995, "epoch": 2 }, { "type": "loss", "content": 0.055928491055965424, "timestamp": "2025-09-04 04:03:08.840166", "step": 1996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:03:08.913556", "step": 1996, "epoch": 2 }, { "type": "loss", "content": 0.006855425424873829, "timestamp": "2025-09-04 04:03:08.928424", "step": 1997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:03:09.000715", "step": 1997, "epoch": 2 }, { "type": "loss", "content": 0.022756323218345642, "timestamp": "2025-09-04 04:03:09.013661", "step": 1998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:03:09.108477", "step": 1998, "epoch": 2 }, { "type": "loss", "content": 0.01630261540412903, "timestamp": "2025-09-04 04:03:09.125996", "step": 1999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:03:09.219144", "step": 1999, "epoch": 2 }, { "type": "loss", "content": 0.00793174933642149, "timestamp": "2025-09-04 04:03:09.237199", "step": 2000, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:03:17.717901", "step": 2000, "epoch": 2 }, { "type": "pplx", "content": 337.7910185199579, "timestamp": "2025-09-04 04:03:17.719822", "step": 2000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-04 04:03:18.081079", "step": 2000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:18.183814", "step": 2000, "epoch": 2 }, { "type": "loss", "content": 0.0038866132963448763, "timestamp": "2025-09-04 04:03:18.205004", "step": 2001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:18.311859", "step": 2001, "epoch": 2 }, { "type": "loss", "content": 0.029996544122695923, "timestamp": "2025-09-04 04:03:18.331766", "step": 2002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:18.412991", "step": 2002, "epoch": 2 }, { "type": "loss", "content": 0.029153253883123398, "timestamp": "2025-09-04 04:03:18.426980", "step": 2003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:18.519253", "step": 2003, "epoch": 2 }, { "type": "loss", "content": 0.059160567820072174, "timestamp": "2025-09-04 04:03:18.536705", "step": 2004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:18.636731", "step": 2004, "epoch": 2 }, { "type": "loss", "content": 0.02167946845293045, "timestamp": "2025-09-04 04:03:18.657389", "step": 2005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:18.761805", "step": 2005, "epoch": 2 }, { "type": "loss", "content": 0.0188386719673872, "timestamp": "2025-09-04 04:03:18.780867", "step": 2006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:18.887731", "step": 2006, "epoch": 2 }, { "type": "loss", "content": 0.0005847454303875566, "timestamp": "2025-09-04 04:03:18.906473", "step": 2007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:18.987245", "step": 2007, "epoch": 2 }, { "type": "loss", "content": 0.008029515855014324, "timestamp": "2025-09-04 04:03:19.002000", "step": 2008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:03:19.077244", "step": 2008, "epoch": 2 }, { "type": "loss", "content": 0.007086843717843294, "timestamp": "2025-09-04 04:03:19.091979", "step": 2009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:19.214193", "step": 2009, "epoch": 2 }, { "type": "loss", "content": 0.008636104874312878, "timestamp": "2025-09-04 04:03:19.233921", "step": 2010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:19.346341", "step": 2010, "epoch": 2 }, { "type": "loss", "content": 0.00841162633150816, "timestamp": "2025-09-04 04:03:19.366348", "step": 2011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:19.478086", "step": 2011, "epoch": 2 }, { "type": "loss", "content": 0.005192655138671398, "timestamp": "2025-09-04 04:03:19.499294", "step": 2012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:03:19.584853", "step": 2012, "epoch": 2 }, { "type": "loss", "content": 0.016467098146677017, "timestamp": "2025-09-04 04:03:19.601656", "step": 2013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:19.714091", "step": 2013, "epoch": 2 }, { "type": "loss", "content": 0.004678299650549889, "timestamp": "2025-09-04 04:03:19.734578", "step": 2014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:19.829432", "step": 2014, "epoch": 2 }, { "type": "loss", "content": 0.038956332951784134, "timestamp": "2025-09-04 04:03:19.846129", "step": 2015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:03:19.946809", "step": 2015, "epoch": 2 }, { "type": "loss", "content": 0.013382869772613049, "timestamp": "2025-09-04 04:03:19.966059", "step": 2016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:20.056506", "step": 2016, "epoch": 2 }, { "type": "loss", "content": 0.006615063641220331, "timestamp": "2025-09-04 04:03:20.074773", "step": 2017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:03:20.154212", "step": 2017, "epoch": 2 }, { "type": "loss", "content": 0.035883184522390366, "timestamp": "2025-09-04 04:03:20.168191", "step": 2018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:20.272520", "step": 2018, "epoch": 2 }, { "type": "loss", "content": 0.0023406874388456345, "timestamp": "2025-09-04 04:03:20.291716", "step": 2019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:20.383383", "step": 2019, "epoch": 2 }, { "type": "loss", "content": 0.09516555815935135, "timestamp": "2025-09-04 04:03:20.400858", "step": 2020, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:03:28.856700", "step": 2020, "epoch": 2 }, { "type": "pplx", "content": 339.743606509487, "timestamp": "2025-09-04 04:03:28.858645", "step": 2020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:03:28.975803", "step": 2020, "epoch": 2 }, { "type": "loss", "content": 0.0009116530418395996, "timestamp": "2025-09-04 04:03:29.001353", "step": 2021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:03:29.087980", "step": 2021, "epoch": 2 }, { "type": "loss", "content": 0.0017411328153684735, "timestamp": "2025-09-04 04:03:29.103549", "step": 2022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:03:29.198141", "step": 2022, "epoch": 2 }, { "type": "loss", "content": 0.018314287066459656, "timestamp": "2025-09-04 04:03:29.215622", "step": 2023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:03:29.316793", "step": 2023, "epoch": 2 }, { "type": "loss", "content": 0.004151246044784784, "timestamp": "2025-09-04 04:03:29.336267", "step": 2024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:29.412528", "step": 2024, "epoch": 2 }, { "type": "loss", "content": 0.03218241408467293, "timestamp": "2025-09-04 04:03:29.428024", "step": 2025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:29.532196", "step": 2025, "epoch": 2 }, { "type": "loss", "content": 0.015404434874653816, "timestamp": "2025-09-04 04:03:29.551399", "step": 2026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:03:29.627359", "step": 2026, "epoch": 2 }, { "type": "loss", "content": 0.057021476328372955, "timestamp": "2025-09-04 04:03:29.641204", "step": 2027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:29.746873", "step": 2027, "epoch": 2 }, { "type": "loss", "content": 0.0008856714703142643, "timestamp": "2025-09-04 04:03:29.767001", "step": 2028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:03:29.841846", "step": 2028, "epoch": 2 }, { "type": "loss", "content": 0.022210262715816498, "timestamp": "2025-09-04 04:03:29.857048", "step": 2029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:29.967779", "step": 2029, "epoch": 2 }, { "type": "loss", "content": 0.005982580129057169, "timestamp": "2025-09-04 04:03:29.988354", "step": 2030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:03:30.074993", "step": 2030, "epoch": 2 }, { "type": "loss", "content": 0.0008112862706184387, "timestamp": "2025-09-04 04:03:30.090723", "step": 2031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 04:03:30.215707", "step": 2031, "epoch": 2 }, { "type": "loss", "content": 0.02068488672375679, "timestamp": "2025-09-04 04:03:30.240475", "step": 2032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:03:30.331022", "step": 2032, "epoch": 2 }, { "type": "loss", "content": 0.03741706535220146, "timestamp": "2025-09-04 04:03:30.349873", "step": 2033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:03:30.432706", "step": 2033, "epoch": 2 }, { "type": "loss", "content": 0.05829369276762009, "timestamp": "2025-09-04 04:03:30.447994", "step": 2034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:03:30.540942", "step": 2034, "epoch": 2 }, { "type": "loss", "content": 0.008479294367134571, "timestamp": "2025-09-04 04:03:30.558435", "step": 2035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:30.668268", "step": 2035, "epoch": 2 }, { "type": "loss", "content": 0.04389806091785431, "timestamp": "2025-09-04 04:03:30.689495", "step": 2036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:30.787055", "step": 2036, "epoch": 2 }, { "type": "loss", "content": 0.02469690330326557, "timestamp": "2025-09-04 04:03:30.807747", "step": 2037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:30.911430", "step": 2037, "epoch": 2 }, { "type": "loss", "content": 0.02199394628405571, "timestamp": "2025-09-04 04:03:30.930735", "step": 2038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:31.033799", "step": 2038, "epoch": 2 }, { "type": "loss", "content": 0.0037276356015354395, "timestamp": "2025-09-04 04:03:31.052730", "step": 2039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:31.131633", "step": 2039, "epoch": 2 }, { "type": "loss", "content": 0.021283473819494247, "timestamp": "2025-09-04 04:03:31.146545", "step": 2040, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:03:39.511875", "step": 2040, "epoch": 2 }, { "type": "pplx", "content": 334.85919534739503, "timestamp": "2025-09-04 04:03:39.513963", "step": 2040, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2040", "timestamp": "2025-09-04 04:03:40.015188", "step": 2040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:40.116989", "step": 2040, "epoch": 2 }, { "type": "loss", "content": 0.02683926559984684, "timestamp": "2025-09-04 04:03:40.138785", "step": 2041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:40.248182", "step": 2041, "epoch": 2 }, { "type": "loss", "content": 0.0386735163629055, "timestamp": "2025-09-04 04:03:40.268736", "step": 2042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:40.369113", "step": 2042, "epoch": 2 }, { "type": "loss", "content": 0.00885379035025835, "timestamp": "2025-09-04 04:03:40.387946", "step": 2043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:03:40.484312", "step": 2043, "epoch": 2 }, { "type": "loss", "content": 0.006810452789068222, "timestamp": "2025-09-04 04:03:40.502525", "step": 2044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:40.590982", "step": 2044, "epoch": 2 }, { "type": "loss", "content": 0.022753320634365082, "timestamp": "2025-09-04 04:03:40.609346", "step": 2045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:03:40.684482", "step": 2045, "epoch": 2 }, { "type": "loss", "content": 0.051130082458257675, "timestamp": "2025-09-04 04:03:40.698277", "step": 2046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:03:40.806236", "step": 2046, "epoch": 2 }, { "type": "loss", "content": 0.028301551938056946, "timestamp": "2025-09-04 04:03:40.826395", "step": 2047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:03:40.942751", "step": 2047, "epoch": 2 }, { "type": "loss", "content": 0.00401369109749794, "timestamp": "2025-09-04 04:03:40.965669", "step": 2048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:03:41.073652", "step": 2048, "epoch": 2 }, { "type": "loss", "content": 0.008323295041918755, "timestamp": "2025-09-04 04:03:41.096381", "step": 2049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:03:41.172951", "step": 2049, "epoch": 2 }, { "type": "loss", "content": 0.010393408127129078, "timestamp": "2025-09-04 04:03:41.186961", "step": 2050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:41.276328", "step": 2050, "epoch": 2 }, { "type": "loss", "content": 0.0038162292912602425, "timestamp": "2025-09-04 04:03:41.293090", "step": 2051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:03:41.402980", "step": 2051, "epoch": 2 }, { "type": "loss", "content": 0.0021734382025897503, "timestamp": "2025-09-04 04:03:41.424184", "step": 2052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:41.522034", "step": 2052, "epoch": 2 }, { "type": "loss", "content": 0.036829832941293716, "timestamp": "2025-09-04 04:03:41.542779", "step": 2053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:03:41.650754", "step": 2053, "epoch": 2 }, { "type": "loss", "content": 0.015699857845902443, "timestamp": "2025-09-04 04:03:41.670987", "step": 2054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:03:41.754688", "step": 2054, "epoch": 2 }, { "type": "loss", "content": 0.014599669724702835, "timestamp": "2025-09-04 04:03:41.769707", "step": 2055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:41.877413", "step": 2055, "epoch": 2 }, { "type": "loss", "content": 0.029209831729531288, "timestamp": "2025-09-04 04:03:41.898633", "step": 2056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:03:41.974188", "step": 2056, "epoch": 2 }, { "type": "loss", "content": 0.022812718525528908, "timestamp": "2025-09-04 04:03:41.989608", "step": 2057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:42.092066", "step": 2057, "epoch": 2 }, { "type": "loss", "content": 0.013986658304929733, "timestamp": "2025-09-04 04:03:42.111323", "step": 2058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:03:42.199588", "step": 2058, "epoch": 2 }, { "type": "loss", "content": 0.055983904749155045, "timestamp": "2025-09-04 04:03:42.215227", "step": 2059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:03:42.319945", "step": 2059, "epoch": 2 }, { "type": "loss", "content": 0.007262484170496464, "timestamp": "2025-09-04 04:03:42.339917", "step": 2060, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:03:50.711637", "step": 2060, "epoch": 2 }, { "type": "pplx", "content": 326.4215391255511, "timestamp": "2025-09-04 04:03:50.713840", "step": 2060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:03:50.791986", "step": 2060, "epoch": 2 }, { "type": "loss", "content": 0.009184667840600014, "timestamp": "2025-09-04 04:03:50.808350", "step": 2061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:50.919805", "step": 2061, "epoch": 2 }, { "type": "loss", "content": 0.07893642038106918, "timestamp": "2025-09-04 04:03:50.940465", "step": 2062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:03:51.033591", "step": 2062, "epoch": 2 }, { "type": "loss", "content": 0.008863512426614761, "timestamp": "2025-09-04 04:03:51.050975", "step": 2063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:51.151163", "step": 2063, "epoch": 2 }, { "type": "loss", "content": 0.007501260843127966, "timestamp": "2025-09-04 04:03:51.170850", "step": 2064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:03:51.267183", "step": 2064, "epoch": 2 }, { "type": "loss", "content": 0.0035026571713387966, "timestamp": "2025-09-04 04:03:51.287661", "step": 2065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:03:51.391160", "step": 2065, "epoch": 2 }, { "type": "loss", "content": 0.010278237983584404, "timestamp": "2025-09-04 04:03:51.408252", "step": 2066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:51.507168", "step": 2066, "epoch": 2 }, { "type": "loss", "content": 0.042156293988227844, "timestamp": "2025-09-04 04:03:51.526171", "step": 2067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:03:51.627938", "step": 2067, "epoch": 2 }, { "type": "loss", "content": 0.024234874173998833, "timestamp": "2025-09-04 04:03:51.647756", "step": 2068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:03:51.752621", "step": 2068, "epoch": 2 }, { "type": "loss", "content": 0.0024930352810770273, "timestamp": "2025-09-04 04:03:51.775261", "step": 2069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:03:51.883717", "step": 2069, "epoch": 2 }, { "type": "loss", "content": 0.043505195528268814, "timestamp": "2025-09-04 04:03:51.903987", "step": 2070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:52.014076", "step": 2070, "epoch": 2 }, { "type": "loss", "content": 0.006645853631198406, "timestamp": "2025-09-04 04:03:52.033063", "step": 2071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:03:52.104148", "step": 2071, "epoch": 2 }, { "type": "loss", "content": 0.02741372399032116, "timestamp": "2025-09-04 04:03:52.117669", "step": 2072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:03:52.210557", "step": 2072, "epoch": 2 }, { "type": "loss", "content": 0.0004673259099945426, "timestamp": "2025-09-04 04:03:52.229709", "step": 2073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:03:52.335470", "step": 2073, "epoch": 2 }, { "type": "loss", "content": 0.023968873545527458, "timestamp": "2025-09-04 04:03:52.355570", "step": 2074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:03:52.490754", "step": 2074, "epoch": 2 }, { "type": "loss", "content": 0.011759743094444275, "timestamp": "2025-09-04 04:03:52.517067", "step": 2075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:03:52.629680", "step": 2075, "epoch": 2 }, { "type": "loss", "content": 0.002521298360079527, "timestamp": "2025-09-04 04:03:52.651213", "step": 2076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:03:52.733607", "step": 2076, "epoch": 2 }, { "type": "loss", "content": 0.019572464749217033, "timestamp": "2025-09-04 04:03:52.750683", "step": 2077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:03:52.840487", "step": 2077, "epoch": 2 }, { "type": "loss", "content": 0.002779679372906685, "timestamp": "2025-09-04 04:03:52.857245", "step": 2078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:52.957934", "step": 2078, "epoch": 2 }, { "type": "loss", "content": 0.026165010407567024, "timestamp": "2025-09-04 04:03:52.976685", "step": 2079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:03:53.076614", "step": 2079, "epoch": 2 }, { "type": "loss", "content": 0.004176696762442589, "timestamp": "2025-09-04 04:03:53.096246", "step": 2080, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:01.472371", "step": 2080, "epoch": 2 }, { "type": "pplx", "content": 320.5430643577628, "timestamp": "2025-09-04 04:04:01.474232", "step": 2080, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2080", "timestamp": "2025-09-04 04:04:01.816278", "step": 2080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:04:01.899311", "step": 2080, "epoch": 2 }, { "type": "loss", "content": 0.0009140381007455289, "timestamp": "2025-09-04 04:04:01.916437", "step": 2081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:02.017975", "step": 2081, "epoch": 2 }, { "type": "loss", "content": 0.0035500312224030495, "timestamp": "2025-09-04 04:04:02.036771", "step": 2082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:04:02.123421", "step": 2082, "epoch": 2 }, { "type": "loss", "content": 0.06194588169455528, "timestamp": "2025-09-04 04:04:02.139060", "step": 2083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:02.234678", "step": 2083, "epoch": 2 }, { "type": "loss", "content": 0.08327718824148178, "timestamp": "2025-09-04 04:04:02.252937", "step": 2084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:04:02.367288", "step": 2084, "epoch": 2 }, { "type": "loss", "content": 0.0026233464013785124, "timestamp": "2025-09-04 04:04:02.391497", "step": 2085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:02.491052", "step": 2085, "epoch": 2 }, { "type": "loss", "content": 0.13432270288467407, "timestamp": "2025-09-04 04:04:02.509568", "step": 2086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:04:02.595298", "step": 2086, "epoch": 2 }, { "type": "loss", "content": 0.026126880198717117, "timestamp": "2025-09-04 04:04:02.610922", "step": 2087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:04:02.732732", "step": 2087, "epoch": 2 }, { "type": "loss", "content": 0.001551034045405686, "timestamp": "2025-09-04 04:04:02.756661", "step": 2088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:02.857941", "step": 2088, "epoch": 2 }, { "type": "loss", "content": 0.002250077435746789, "timestamp": "2025-09-04 04:04:02.879175", "step": 2089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:02.980040", "step": 2089, "epoch": 2 }, { "type": "loss", "content": 0.03148679807782173, "timestamp": "2025-09-04 04:04:02.998944", "step": 2090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:03.093272", "step": 2090, "epoch": 2 }, { "type": "loss", "content": 0.01184455119073391, "timestamp": "2025-09-04 04:04:03.110673", "step": 2091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:03.211623", "step": 2091, "epoch": 2 }, { "type": "loss", "content": 0.0011833092430606484, "timestamp": "2025-09-04 04:04:03.231212", "step": 2092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:03.324401", "step": 2092, "epoch": 2 }, { "type": "loss", "content": 0.003595761489123106, "timestamp": "2025-09-04 04:04:03.343596", "step": 2093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:03.437989", "step": 2093, "epoch": 2 }, { "type": "loss", "content": 0.027849143370985985, "timestamp": "2025-09-04 04:04:03.455117", "step": 2094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:03.556386", "step": 2094, "epoch": 2 }, { "type": "loss", "content": 0.022928999736905098, "timestamp": "2025-09-04 04:04:03.575190", "step": 2095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:03.668150", "step": 2095, "epoch": 2 }, { "type": "loss", "content": 0.07359001040458679, "timestamp": "2025-09-04 04:04:03.686058", "step": 2096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:03.789756", "step": 2096, "epoch": 2 }, { "type": "loss", "content": 0.05725327506661415, "timestamp": "2025-09-04 04:04:03.811698", "step": 2097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:03.915279", "step": 2097, "epoch": 2 }, { "type": "loss", "content": 0.04447811469435692, "timestamp": "2025-09-04 04:04:03.934528", "step": 2098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:04.039378", "step": 2098, "epoch": 2 }, { "type": "loss", "content": 0.0025624537374824286, "timestamp": "2025-09-04 04:04:04.057979", "step": 2099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:04.158399", "step": 2099, "epoch": 2 }, { "type": "loss", "content": 0.0035990336909890175, "timestamp": "2025-09-04 04:04:04.178016", "step": 2100, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:12.532858", "step": 2100, "epoch": 2 }, { "type": "pplx", "content": 314.23096938736927, "timestamp": "2025-09-04 04:04:12.534871", "step": 2100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:12.620879", "step": 2100, "epoch": 2 }, { "type": "loss", "content": 0.003338422393426299, "timestamp": "2025-09-04 04:04:12.639036", "step": 2101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:04:12.731933", "step": 2101, "epoch": 2 }, { "type": "loss", "content": 0.002954112831503153, "timestamp": "2025-09-04 04:04:12.749327", "step": 2102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:12.850749", "step": 2102, "epoch": 2 }, { "type": "loss", "content": 0.009453012607991695, "timestamp": "2025-09-04 04:04:12.869799", "step": 2103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:04:13.007718", "step": 2103, "epoch": 2 }, { "type": "loss", "content": 0.020522311329841614, "timestamp": "2025-09-04 04:04:13.034524", "step": 2104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:13.121973", "step": 2104, "epoch": 2 }, { "type": "loss", "content": 0.024143625050783157, "timestamp": "2025-09-04 04:04:13.140329", "step": 2105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:13.232648", "step": 2105, "epoch": 2 }, { "type": "loss", "content": 0.00796930119395256, "timestamp": "2025-09-04 04:04:13.249377", "step": 2106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:13.351215", "step": 2106, "epoch": 2 }, { "type": "loss", "content": 0.000632771581877023, "timestamp": "2025-09-04 04:04:13.370206", "step": 2107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:13.460383", "step": 2107, "epoch": 2 }, { "type": "loss", "content": 0.002220430178567767, "timestamp": "2025-09-04 04:04:13.478003", "step": 2108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:13.583120", "step": 2108, "epoch": 2 }, { "type": "loss", "content": 0.01635553501546383, "timestamp": "2025-09-04 04:04:13.604209", "step": 2109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:13.710252", "step": 2109, "epoch": 2 }, { "type": "loss", "content": 0.017017148435115814, "timestamp": "2025-09-04 04:04:13.729384", "step": 2110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:04:13.806648", "step": 2110, "epoch": 2 }, { "type": "loss", "content": 0.0019480792107060552, "timestamp": "2025-09-04 04:04:13.820438", "step": 2111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:13.929794", "step": 2111, "epoch": 2 }, { "type": "loss", "content": 0.012161768041551113, "timestamp": "2025-09-04 04:04:13.950904", "step": 2112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:14.049070", "step": 2112, "epoch": 2 }, { "type": "loss", "content": 0.048178572207689285, "timestamp": "2025-09-04 04:04:14.069251", "step": 2113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:14.173919", "step": 2113, "epoch": 2 }, { "type": "loss", "content": 0.003165569854900241, "timestamp": "2025-09-04 04:04:14.192987", "step": 2114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:14.297053", "step": 2114, "epoch": 2 }, { "type": "loss", "content": 0.015800610184669495, "timestamp": "2025-09-04 04:04:14.316020", "step": 2115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:14.416443", "step": 2115, "epoch": 2 }, { "type": "loss", "content": 0.048695262521505356, "timestamp": "2025-09-04 04:04:14.435860", "step": 2116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:14.535495", "step": 2116, "epoch": 2 }, { "type": "loss", "content": 0.002211391692981124, "timestamp": "2025-09-04 04:04:14.556550", "step": 2117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:04:14.666573", "step": 2117, "epoch": 2 }, { "type": "loss", "content": 0.006672736257314682, "timestamp": "2025-09-04 04:04:14.687028", "step": 2118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:04:14.803052", "step": 2118, "epoch": 2 }, { "type": "loss", "content": 0.010592850856482983, "timestamp": "2025-09-04 04:04:14.825098", "step": 2119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:14.932592", "step": 2119, "epoch": 2 }, { "type": "loss", "content": 0.020320625975728035, "timestamp": "2025-09-04 04:04:14.953122", "step": 2120, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:23.313391", "step": 2120, "epoch": 2 }, { "type": "pplx", "content": 312.7804928893332, "timestamp": "2025-09-04 04:04:23.315624", "step": 2120, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2120", "timestamp": "2025-09-04 04:04:23.681997", "step": 2120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:23.781623", "step": 2120, "epoch": 2 }, { "type": "loss", "content": 0.010732216760516167, "timestamp": "2025-09-04 04:04:23.802720", "step": 2121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:04:23.889401", "step": 2121, "epoch": 2 }, { "type": "loss", "content": 0.005511862691491842, "timestamp": "2025-09-04 04:04:23.905182", "step": 2122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:23.997677", "step": 2122, "epoch": 2 }, { "type": "loss", "content": 0.0008789485436864197, "timestamp": "2025-09-04 04:04:24.014839", "step": 2123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 04:04:24.199203", "step": 2123, "epoch": 2 }, { "type": "loss", "content": 0.018326403573155403, "timestamp": "2025-09-04 04:04:24.234414", "step": 2124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:24.324777", "step": 2124, "epoch": 2 }, { "type": "loss", "content": 0.012555736117064953, "timestamp": "2025-09-04 04:04:24.343404", "step": 2125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:24.453436", "step": 2125, "epoch": 2 }, { "type": "loss", "content": 0.0006459427531808615, "timestamp": "2025-09-04 04:04:24.473852", "step": 2126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:24.575742", "step": 2126, "epoch": 2 }, { "type": "loss", "content": 0.0454648919403553, "timestamp": "2025-09-04 04:04:24.594874", "step": 2127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:04:24.680520", "step": 2127, "epoch": 2 }, { "type": "loss", "content": 0.10542017221450806, "timestamp": "2025-09-04 04:04:24.696351", "step": 2128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:04:24.782005", "step": 2128, "epoch": 2 }, { "type": "loss", "content": 0.014704999513924122, "timestamp": "2025-09-04 04:04:24.799271", "step": 2129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:24.889998", "step": 2129, "epoch": 2 }, { "type": "loss", "content": 0.0018945076735690236, "timestamp": "2025-09-04 04:04:24.906873", "step": 2130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:25.005752", "step": 2130, "epoch": 2 }, { "type": "loss", "content": 0.03164684399962425, "timestamp": "2025-09-04 04:04:25.024319", "step": 2131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:25.122975", "step": 2131, "epoch": 2 }, { "type": "loss", "content": 0.0005463764537125826, "timestamp": "2025-09-04 04:04:25.142239", "step": 2132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 04:04:25.323234", "step": 2132, "epoch": 2 }, { "type": "loss", "content": 0.008721102960407734, "timestamp": "2025-09-04 04:04:25.361001", "step": 2133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:25.459833", "step": 2133, "epoch": 2 }, { "type": "loss", "content": 0.008502230979502201, "timestamp": "2025-09-04 04:04:25.478374", "step": 2134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:04:25.555190", "step": 2134, "epoch": 2 }, { "type": "loss", "content": 0.06675466895103455, "timestamp": "2025-09-04 04:04:25.569241", "step": 2135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:25.670339", "step": 2135, "epoch": 2 }, { "type": "loss", "content": 0.006031819619238377, "timestamp": "2025-09-04 04:04:25.689924", "step": 2136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:25.788530", "step": 2136, "epoch": 2 }, { "type": "loss", "content": 0.005401272792369127, "timestamp": "2025-09-04 04:04:25.809427", "step": 2137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:25.904126", "step": 2137, "epoch": 2 }, { "type": "loss", "content": 0.008368422277271748, "timestamp": "2025-09-04 04:04:25.921786", "step": 2138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:26.024282", "step": 2138, "epoch": 2 }, { "type": "loss", "content": 0.004673839081078768, "timestamp": "2025-09-04 04:04:26.043265", "step": 2139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:26.153282", "step": 2139, "epoch": 2 }, { "type": "loss", "content": 0.03570368513464928, "timestamp": "2025-09-04 04:04:26.174401", "step": 2140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:34.544277", "step": 2140, "epoch": 2 }, { "type": "pplx", "content": 310.4419842128476, "timestamp": "2025-09-04 04:04:34.546820", "step": 2140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:34.636749", "step": 2140, "epoch": 2 }, { "type": "loss", "content": 0.0012499855365604162, "timestamp": "2025-09-04 04:04:34.655620", "step": 2141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:04:34.731528", "step": 2141, "epoch": 2 }, { "type": "loss", "content": 0.006291459314525127, "timestamp": "2025-09-04 04:04:34.745151", "step": 2142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:04:34.853013", "step": 2142, "epoch": 2 }, { "type": "loss", "content": 0.0022742494475096464, "timestamp": "2025-09-04 04:04:34.873107", "step": 2143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:34.984340", "step": 2143, "epoch": 2 }, { "type": "loss", "content": 0.01345337089151144, "timestamp": "2025-09-04 04:04:35.005729", "step": 2144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:35.094070", "step": 2144, "epoch": 2 }, { "type": "loss", "content": 0.03179781138896942, "timestamp": "2025-09-04 04:04:35.112411", "step": 2145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:35.212528", "step": 2145, "epoch": 2 }, { "type": "loss", "content": 0.004378853365778923, "timestamp": "2025-09-04 04:04:35.231121", "step": 2146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 04:04:35.435817", "step": 2146, "epoch": 2 }, { "type": "loss", "content": 0.0009116848814301193, "timestamp": "2025-09-04 04:04:35.475134", "step": 2147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:35.578468", "step": 2147, "epoch": 2 }, { "type": "loss", "content": 0.007793641183525324, "timestamp": "2025-09-04 04:04:35.598367", "step": 2148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:04:35.705536", "step": 2148, "epoch": 2 }, { "type": "loss", "content": 0.009497767314314842, "timestamp": "2025-09-04 04:04:35.728150", "step": 2149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:04:35.812631", "step": 2149, "epoch": 2 }, { "type": "loss", "content": 0.038677141070365906, "timestamp": "2025-09-04 04:04:35.827760", "step": 2150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:04:35.910668", "step": 2150, "epoch": 2 }, { "type": "loss", "content": 0.006493302993476391, "timestamp": "2025-09-04 04:04:35.925541", "step": 2151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:36.021059", "step": 2151, "epoch": 2 }, { "type": "loss", "content": 0.006713113281875849, "timestamp": "2025-09-04 04:04:36.039260", "step": 2152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:36.144671", "step": 2152, "epoch": 2 }, { "type": "loss", "content": 0.05974787473678589, "timestamp": "2025-09-04 04:04:36.165757", "step": 2153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:36.257893", "step": 2153, "epoch": 2 }, { "type": "loss", "content": 0.005305037368088961, "timestamp": "2025-09-04 04:04:36.274415", "step": 2154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:36.375842", "step": 2154, "epoch": 2 }, { "type": "loss", "content": 0.018993912264704704, "timestamp": "2025-09-04 04:04:36.394459", "step": 2155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:04:36.471725", "step": 2155, "epoch": 2 }, { "type": "loss", "content": 0.02348383143544197, "timestamp": "2025-09-04 04:04:36.486467", "step": 2156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:36.578110", "step": 2156, "epoch": 2 }, { "type": "loss", "content": 0.020909776911139488, "timestamp": "2025-09-04 04:04:36.596903", "step": 2157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:36.687043", "step": 2157, "epoch": 2 }, { "type": "loss", "content": 0.01592099852859974, "timestamp": "2025-09-04 04:04:36.703800", "step": 2158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:36.813038", "step": 2158, "epoch": 2 }, { "type": "loss", "content": 0.045487433671951294, "timestamp": "2025-09-04 04:04:36.833248", "step": 2159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:36.937059", "step": 2159, "epoch": 2 }, { "type": "loss", "content": 0.02561834827065468, "timestamp": "2025-09-04 04:04:36.956799", "step": 2160, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:45.360928", "step": 2160, "epoch": 2 }, { "type": "pplx", "content": 308.36794591931954, "timestamp": "2025-09-04 04:04:45.362993", "step": 2160, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2160", "timestamp": "2025-09-04 04:04:45.876044", "step": 2160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:45.978684", "step": 2160, "epoch": 2 }, { "type": "loss", "content": 0.01050900761038065, "timestamp": "2025-09-04 04:04:46.000386", "step": 2161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:46.105099", "step": 2161, "epoch": 2 }, { "type": "loss", "content": 0.0030154536943882704, "timestamp": "2025-09-04 04:04:46.124404", "step": 2162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:46.227803", "step": 2162, "epoch": 2 }, { "type": "loss", "content": 0.019422519952058792, "timestamp": "2025-09-04 04:04:46.247095", "step": 2163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:46.352915", "step": 2163, "epoch": 2 }, { "type": "loss", "content": 0.05433286726474762, "timestamp": "2025-09-04 04:04:46.373652", "step": 2164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:46.464426", "step": 2164, "epoch": 2 }, { "type": "loss", "content": 0.020474649965763092, "timestamp": "2025-09-04 04:04:46.483111", "step": 2165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:46.584033", "step": 2165, "epoch": 2 }, { "type": "loss", "content": 0.004737554118037224, "timestamp": "2025-09-04 04:04:46.602973", "step": 2166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 04:04:46.733689", "step": 2166, "epoch": 2 }, { "type": "loss", "content": 0.0017673440743237734, "timestamp": "2025-09-04 04:04:46.758348", "step": 2167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:46.859429", "step": 2167, "epoch": 2 }, { "type": "loss", "content": 0.052274227142333984, "timestamp": "2025-09-04 04:04:46.879084", "step": 2168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:46.977326", "step": 2168, "epoch": 2 }, { "type": "loss", "content": 0.004377824254333973, "timestamp": "2025-09-04 04:04:46.997720", "step": 2169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:47.103598", "step": 2169, "epoch": 2 }, { "type": "loss", "content": 0.04207998141646385, "timestamp": "2025-09-04 04:04:47.123611", "step": 2170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:04:47.225813", "step": 2170, "epoch": 2 }, { "type": "loss", "content": 0.007436053827404976, "timestamp": "2025-09-04 04:04:47.243190", "step": 2171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:04:47.337262", "step": 2171, "epoch": 2 }, { "type": "loss", "content": 0.007250096648931503, "timestamp": "2025-09-04 04:04:47.353490", "step": 2172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:04:47.434923", "step": 2172, "epoch": 2 }, { "type": "loss", "content": 0.0015918496064841747, "timestamp": "2025-09-04 04:04:47.451549", "step": 2173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:47.556307", "step": 2173, "epoch": 2 }, { "type": "loss", "content": 0.01956302858889103, "timestamp": "2025-09-04 04:04:47.575618", "step": 2174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:47.681540", "step": 2174, "epoch": 2 }, { "type": "loss", "content": 0.04470936954021454, "timestamp": "2025-09-04 04:04:47.701563", "step": 2175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:47.802526", "step": 2175, "epoch": 2 }, { "type": "loss", "content": 0.03270625323057175, "timestamp": "2025-09-04 04:04:47.822128", "step": 2176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:47.930127", "step": 2176, "epoch": 2 }, { "type": "loss", "content": 0.0014341897331178188, "timestamp": "2025-09-04 04:04:47.952666", "step": 2177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:48.053251", "step": 2177, "epoch": 2 }, { "type": "loss", "content": 0.019161755219101906, "timestamp": "2025-09-04 04:04:48.071882", "step": 2178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:48.181349", "step": 2178, "epoch": 2 }, { "type": "loss", "content": 0.0018422268331050873, "timestamp": "2025-09-04 04:04:48.201898", "step": 2179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:48.295085", "step": 2179, "epoch": 2 }, { "type": "loss", "content": 0.017137227579951286, "timestamp": "2025-09-04 04:04:48.312984", "step": 2180, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:04:56.709103", "step": 2180, "epoch": 2 }, { "type": "pplx", "content": 308.3335132373899, "timestamp": "2025-09-04 04:04:56.710918", "step": 2180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:56.799844", "step": 2180, "epoch": 2 }, { "type": "loss", "content": 0.013462487608194351, "timestamp": "2025-09-04 04:04:56.818604", "step": 2181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:04:56.915894", "step": 2181, "epoch": 2 }, { "type": "loss", "content": 0.015639422461390495, "timestamp": "2025-09-04 04:04:56.933526", "step": 2182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:57.027042", "step": 2182, "epoch": 2 }, { "type": "loss", "content": 0.008944478817284107, "timestamp": "2025-09-04 04:04:57.044188", "step": 2183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:57.147968", "step": 2183, "epoch": 2 }, { "type": "loss", "content": 0.037470266222953796, "timestamp": "2025-09-04 04:04:57.168011", "step": 2184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:57.266544", "step": 2184, "epoch": 2 }, { "type": "loss", "content": 0.04356255754828453, "timestamp": "2025-09-04 04:04:57.287325", "step": 2185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:04:57.380483", "step": 2185, "epoch": 2 }, { "type": "loss", "content": 0.012546907179057598, "timestamp": "2025-09-04 04:04:57.397427", "step": 2186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:57.504540", "step": 2186, "epoch": 2 }, { "type": "loss", "content": 0.0026249419897794724, "timestamp": "2025-09-04 04:04:57.524297", "step": 2187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:57.628114", "step": 2187, "epoch": 2 }, { "type": "loss", "content": 0.007875807583332062, "timestamp": "2025-09-04 04:04:57.647908", "step": 2188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:04:57.755912", "step": 2188, "epoch": 2 }, { "type": "loss", "content": 0.020884279161691666, "timestamp": "2025-09-04 04:04:57.778278", "step": 2189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:57.881872", "step": 2189, "epoch": 2 }, { "type": "loss", "content": 0.010031616315245628, "timestamp": "2025-09-04 04:04:57.901024", "step": 2190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:04:57.984919", "step": 2190, "epoch": 2 }, { "type": "loss", "content": 0.01399738434702158, "timestamp": "2025-09-04 04:04:58.000170", "step": 2191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:58.099531", "step": 2191, "epoch": 2 }, { "type": "loss", "content": 0.008733275346457958, "timestamp": "2025-09-04 04:04:58.118882", "step": 2192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:04:58.221416", "step": 2192, "epoch": 2 }, { "type": "loss", "content": 0.03269082307815552, "timestamp": "2025-09-04 04:04:58.241771", "step": 2193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:04:58.343688", "step": 2193, "epoch": 2 }, { "type": "loss", "content": 0.03407390043139458, "timestamp": "2025-09-04 04:04:58.362657", "step": 2194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:58.462986", "step": 2194, "epoch": 2 }, { "type": "loss", "content": 0.016373533755540848, "timestamp": "2025-09-04 04:04:58.481948", "step": 2195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:04:58.582160", "step": 2195, "epoch": 2 }, { "type": "loss", "content": 0.002695757895708084, "timestamp": "2025-09-04 04:04:58.601793", "step": 2196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:58.702197", "step": 2196, "epoch": 2 }, { "type": "loss", "content": 0.03435313701629639, "timestamp": "2025-09-04 04:04:58.723146", "step": 2197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:04:58.829250", "step": 2197, "epoch": 2 }, { "type": "loss", "content": 0.011219386011362076, "timestamp": "2025-09-04 04:04:58.849348", "step": 2198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:04:58.940281", "step": 2198, "epoch": 2 }, { "type": "loss", "content": 0.018078068271279335, "timestamp": "2025-09-04 04:04:58.957188", "step": 2199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:04:59.062258", "step": 2199, "epoch": 2 }, { "type": "loss", "content": 0.028233405202627182, "timestamp": "2025-09-04 04:04:59.082408", "step": 2200, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:05:07.470250", "step": 2200, "epoch": 2 }, { "type": "pplx", "content": 312.6614110155863, "timestamp": "2025-09-04 04:05:07.473143", "step": 2200, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2200", "timestamp": "2025-09-04 04:05:07.823843", "step": 2200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:07.923299", "step": 2200, "epoch": 2 }, { "type": "loss", "content": 0.016595905646681786, "timestamp": "2025-09-04 04:05:07.944190", "step": 2201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:08.039605", "step": 2201, "epoch": 2 }, { "type": "loss", "content": 0.016727754846215248, "timestamp": "2025-09-04 04:05:08.056980", "step": 2202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:08.149959", "step": 2202, "epoch": 2 }, { "type": "loss", "content": 0.008159826509654522, "timestamp": "2025-09-04 04:05:08.166773", "step": 2203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:05:08.254409", "step": 2203, "epoch": 2 }, { "type": "loss", "content": 0.016688521951436996, "timestamp": "2025-09-04 04:05:08.270697", "step": 2204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:08.363758", "step": 2204, "epoch": 2 }, { "type": "loss", "content": 0.003086843527853489, "timestamp": "2025-09-04 04:05:08.382943", "step": 2205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:08.487266", "step": 2205, "epoch": 2 }, { "type": "loss", "content": 0.003638209542259574, "timestamp": "2025-09-04 04:05:08.506389", "step": 2206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:08.600858", "step": 2206, "epoch": 2 }, { "type": "loss", "content": 0.01052644569426775, "timestamp": "2025-09-04 04:05:08.618092", "step": 2207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:08.722308", "step": 2207, "epoch": 2 }, { "type": "loss", "content": 0.022792614996433258, "timestamp": "2025-09-04 04:05:08.742369", "step": 2208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:08.846839", "step": 2208, "epoch": 2 }, { "type": "loss", "content": 0.002457141410559416, "timestamp": "2025-09-04 04:05:08.868936", "step": 2209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:08.959620", "step": 2209, "epoch": 2 }, { "type": "loss", "content": 0.06071026250720024, "timestamp": "2025-09-04 04:05:08.976336", "step": 2210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:09.082718", "step": 2210, "epoch": 2 }, { "type": "loss", "content": 0.03304049372673035, "timestamp": "2025-09-04 04:05:09.101999", "step": 2211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:09.205527", "step": 2211, "epoch": 2 }, { "type": "loss", "content": 0.00047334007103927433, "timestamp": "2025-09-04 04:05:09.225143", "step": 2212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:09.316744", "step": 2212, "epoch": 2 }, { "type": "loss", "content": 0.03290455788373947, "timestamp": "2025-09-04 04:05:09.335668", "step": 2213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:09.430257", "step": 2213, "epoch": 2 }, { "type": "loss", "content": 0.007393876556307077, "timestamp": "2025-09-04 04:05:09.447602", "step": 2214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:09.551318", "step": 2214, "epoch": 2 }, { "type": "loss", "content": 0.012073284946382046, "timestamp": "2025-09-04 04:05:09.570585", "step": 2215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:09.662952", "step": 2215, "epoch": 2 }, { "type": "loss", "content": 0.07519317418336868, "timestamp": "2025-09-04 04:05:09.680572", "step": 2216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:09.781402", "step": 2216, "epoch": 2 }, { "type": "loss", "content": 0.04182714223861694, "timestamp": "2025-09-04 04:05:09.801878", "step": 2217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:09.907420", "step": 2217, "epoch": 2 }, { "type": "loss", "content": 0.0027501049917191267, "timestamp": "2025-09-04 04:05:09.926426", "step": 2218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:05:10.062445", "step": 2218, "epoch": 2 }, { "type": "loss", "content": 0.007170806173235178, "timestamp": "2025-09-04 04:05:10.088381", "step": 2219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:10.184360", "step": 2219, "epoch": 2 }, { "type": "loss", "content": 0.01622505858540535, "timestamp": "2025-09-04 04:05:10.202528", "step": 2220, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:05:18.580414", "step": 2220, "epoch": 2 }, { "type": "pplx", "content": 317.98368699126075, "timestamp": "2025-09-04 04:05:18.582752", "step": 2220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:18.682258", "step": 2220, "epoch": 2 }, { "type": "loss", "content": 0.0025658165104687214, "timestamp": "2025-09-04 04:05:18.703606", "step": 2221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:05:18.778872", "step": 2221, "epoch": 2 }, { "type": "loss", "content": 0.013223507441580296, "timestamp": "2025-09-04 04:05:18.792344", "step": 2222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:18.885172", "step": 2222, "epoch": 2 }, { "type": "loss", "content": 0.010641835629940033, "timestamp": "2025-09-04 04:05:18.902241", "step": 2223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:05:19.036671", "step": 2223, "epoch": 2 }, { "type": "loss", "content": 0.011351378634572029, "timestamp": "2025-09-04 04:05:19.063427", "step": 2224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:19.164795", "step": 2224, "epoch": 2 }, { "type": "loss", "content": 0.00913854967802763, "timestamp": "2025-09-04 04:05:19.186006", "step": 2225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:05:19.283788", "step": 2225, "epoch": 2 }, { "type": "loss", "content": 0.034181058406829834, "timestamp": "2025-09-04 04:05:19.301287", "step": 2226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:19.392275", "step": 2226, "epoch": 2 }, { "type": "loss", "content": 0.0016610038001090288, "timestamp": "2025-09-04 04:05:19.409064", "step": 2227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:05:19.485169", "step": 2227, "epoch": 2 }, { "type": "loss", "content": 0.02116265520453453, "timestamp": "2025-09-04 04:05:19.499668", "step": 2228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:05:19.601431", "step": 2228, "epoch": 2 }, { "type": "loss", "content": 0.006079908460378647, "timestamp": "2025-09-04 04:05:19.621854", "step": 2229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:05:19.731909", "step": 2229, "epoch": 2 }, { "type": "loss", "content": 0.0019000859465450048, "timestamp": "2025-09-04 04:05:19.752516", "step": 2230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:19.847153", "step": 2230, "epoch": 2 }, { "type": "loss", "content": 0.03679632768034935, "timestamp": "2025-09-04 04:05:19.864479", "step": 2231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:05:19.986974", "step": 2231, "epoch": 2 }, { "type": "loss", "content": 0.004042410757392645, "timestamp": "2025-09-04 04:05:20.011031", "step": 2232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:20.116757", "step": 2232, "epoch": 2 }, { "type": "loss", "content": 0.011536908335983753, "timestamp": "2025-09-04 04:05:20.137776", "step": 2233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 04:05:20.313896", "step": 2233, "epoch": 2 }, { "type": "loss", "content": 0.0038091272581368685, "timestamp": "2025-09-04 04:05:20.346627", "step": 2234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:20.452628", "step": 2234, "epoch": 2 }, { "type": "loss", "content": 0.002943043364211917, "timestamp": "2025-09-04 04:05:20.472632", "step": 2235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:20.579930", "step": 2235, "epoch": 2 }, { "type": "loss", "content": 0.009634853340685368, "timestamp": "2025-09-04 04:05:20.600869", "step": 2236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:20.690783", "step": 2236, "epoch": 2 }, { "type": "loss", "content": 0.022745907306671143, "timestamp": "2025-09-04 04:05:20.709227", "step": 2237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:20.818333", "step": 2237, "epoch": 2 }, { "type": "loss", "content": 0.003655359148979187, "timestamp": "2025-09-04 04:05:20.838412", "step": 2238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:05:20.930756", "step": 2238, "epoch": 2 }, { "type": "loss", "content": 0.016437901183962822, "timestamp": "2025-09-04 04:05:20.946338", "step": 2239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:21.053890", "step": 2239, "epoch": 2 }, { "type": "loss", "content": 0.01712539792060852, "timestamp": "2025-09-04 04:05:21.074793", "step": 2240, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:05:29.475438", "step": 2240, "epoch": 2 }, { "type": "pplx", "content": 320.6073536444029, "timestamp": "2025-09-04 04:05:29.477692", "step": 2240, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2240", "timestamp": "2025-09-04 04:05:29.965605", "step": 2240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:30.067717", "step": 2240, "epoch": 2 }, { "type": "loss", "content": 0.003082787152379751, "timestamp": "2025-09-04 04:05:30.089527", "step": 2241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:05:30.294473", "step": 2241, "epoch": 2 }, { "type": "loss", "content": 0.06166600435972214, "timestamp": "2025-09-04 04:05:30.333615", "step": 2242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:05:30.412898", "step": 2242, "epoch": 2 }, { "type": "loss", "content": 0.007548161782324314, "timestamp": "2025-09-04 04:05:30.427094", "step": 2243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:05:30.544314", "step": 2243, "epoch": 2 }, { "type": "loss", "content": 0.026522303000092506, "timestamp": "2025-09-04 04:05:30.567160", "step": 2244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:05:30.675609", "step": 2244, "epoch": 2 }, { "type": "loss", "content": 0.04962928593158722, "timestamp": "2025-09-04 04:05:30.697859", "step": 2245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:30.789135", "step": 2245, "epoch": 2 }, { "type": "loss", "content": 0.001753210905008018, "timestamp": "2025-09-04 04:05:30.805937", "step": 2246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:05:30.891711", "step": 2246, "epoch": 2 }, { "type": "loss", "content": 0.0030653884168714285, "timestamp": "2025-09-04 04:05:30.907210", "step": 2247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:31.010986", "step": 2247, "epoch": 2 }, { "type": "loss", "content": 0.04342466592788696, "timestamp": "2025-09-04 04:05:31.030783", "step": 2248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:05:31.123299", "step": 2248, "epoch": 2 }, { "type": "loss", "content": 0.02544695883989334, "timestamp": "2025-09-04 04:05:31.142550", "step": 2249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 04:05:31.317706", "step": 2249, "epoch": 2 }, { "type": "loss", "content": 0.005824543070048094, "timestamp": "2025-09-04 04:05:31.352320", "step": 2250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:31.447443", "step": 2250, "epoch": 2 }, { "type": "loss", "content": 0.004376427736133337, "timestamp": "2025-09-04 04:05:31.464644", "step": 2251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:05:31.548999", "step": 2251, "epoch": 2 }, { "type": "loss", "content": 0.03957490622997284, "timestamp": "2025-09-04 04:05:31.564806", "step": 2252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:05:31.649806", "step": 2252, "epoch": 2 }, { "type": "loss", "content": 0.006183784920722246, "timestamp": "2025-09-04 04:05:31.666917", "step": 2253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:05:31.745329", "step": 2253, "epoch": 2 }, { "type": "loss", "content": 0.004066129215061665, "timestamp": "2025-09-04 04:05:31.759132", "step": 2254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:31.863127", "step": 2254, "epoch": 2 }, { "type": "loss", "content": 0.009590189903974533, "timestamp": "2025-09-04 04:05:31.882110", "step": 2255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:05:31.979430", "step": 2255, "epoch": 2 }, { "type": "loss", "content": 0.015844259411096573, "timestamp": "2025-09-04 04:05:31.997578", "step": 2256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:05:32.104647", "step": 2256, "epoch": 2 }, { "type": "loss", "content": 0.007126522250473499, "timestamp": "2025-09-04 04:05:32.126836", "step": 2257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:05:32.331721", "step": 2257, "epoch": 2 }, { "type": "loss", "content": 0.0030276242177933455, "timestamp": "2025-09-04 04:05:32.370738", "step": 2258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:32.476655", "step": 2258, "epoch": 2 }, { "type": "loss", "content": 0.01780683733522892, "timestamp": "2025-09-04 04:05:32.495823", "step": 2259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:05:32.596864", "step": 2259, "epoch": 2 }, { "type": "loss", "content": 0.0012856582179665565, "timestamp": "2025-09-04 04:05:32.616120", "step": 2260, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:05:41.117371", "step": 2260, "epoch": 2 }, { "type": "pplx", "content": 319.322025056002, "timestamp": "2025-09-04 04:05:41.119565", "step": 2260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:05:41.234127", "step": 2260, "epoch": 2 }, { "type": "loss", "content": 0.10266165435314178, "timestamp": "2025-09-04 04:05:41.257967", "step": 2261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:41.361408", "step": 2261, "epoch": 2 }, { "type": "loss", "content": 0.013549219816923141, "timestamp": "2025-09-04 04:05:41.380593", "step": 2262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:41.482941", "step": 2262, "epoch": 2 }, { "type": "loss", "content": 0.0038013458251953125, "timestamp": "2025-09-04 04:05:41.501959", "step": 2263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:05:41.596910", "step": 2263, "epoch": 2 }, { "type": "loss", "content": 0.06860756129026413, "timestamp": "2025-09-04 04:05:41.615157", "step": 2264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:05:41.707053", "step": 2264, "epoch": 2 }, { "type": "loss", "content": 0.04129083827137947, "timestamp": "2025-09-04 04:05:41.725972", "step": 2265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:41.826374", "step": 2265, "epoch": 2 }, { "type": "loss", "content": 0.037230126559734344, "timestamp": "2025-09-04 04:05:41.845042", "step": 2266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:41.947523", "step": 2266, "epoch": 2 }, { "type": "loss", "content": 0.009212334640324116, "timestamp": "2025-09-04 04:05:41.966655", "step": 2267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:42.071971", "step": 2267, "epoch": 2 }, { "type": "loss", "content": 0.013861672952771187, "timestamp": "2025-09-04 04:05:42.091888", "step": 2268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:05:42.166045", "step": 2268, "epoch": 2 }, { "type": "loss", "content": 0.007805598899722099, "timestamp": "2025-09-04 04:05:42.180729", "step": 2269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:42.285975", "step": 2269, "epoch": 2 }, { "type": "loss", "content": 0.0023477617651224136, "timestamp": "2025-09-04 04:05:42.305107", "step": 2270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:42.405410", "step": 2270, "epoch": 2 }, { "type": "loss", "content": 0.0031649265438318253, "timestamp": "2025-09-04 04:05:42.424225", "step": 2271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:05:42.524728", "step": 2271, "epoch": 2 }, { "type": "loss", "content": 0.018990959972143173, "timestamp": "2025-09-04 04:05:42.544057", "step": 2272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:42.632585", "step": 2272, "epoch": 2 }, { "type": "loss", "content": 0.011629465036094189, "timestamp": "2025-09-04 04:05:42.650953", "step": 2273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:05:42.729086", "step": 2273, "epoch": 2 }, { "type": "loss", "content": 0.0017159185372292995, "timestamp": "2025-09-04 04:05:42.743117", "step": 2274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 04:05:42.957175", "step": 2274, "epoch": 2 }, { "type": "loss", "content": 0.012947708368301392, "timestamp": "2025-09-04 04:05:42.997905", "step": 2275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:05:43.092714", "step": 2275, "epoch": 2 }, { "type": "loss", "content": 0.03470579907298088, "timestamp": "2025-09-04 04:05:43.110971", "step": 2276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:05:43.187661", "step": 2276, "epoch": 2 }, { "type": "loss", "content": 0.005358322989195585, "timestamp": "2025-09-04 04:05:43.202956", "step": 2277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:43.306397", "step": 2277, "epoch": 2 }, { "type": "loss", "content": 0.008757129311561584, "timestamp": "2025-09-04 04:05:43.325338", "step": 2278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 04:05:43.453113", "step": 2278, "epoch": 2 }, { "type": "loss", "content": 0.001655671396292746, "timestamp": "2025-09-04 04:05:43.477203", "step": 2279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:43.581200", "step": 2279, "epoch": 2 }, { "type": "loss", "content": 0.003800937905907631, "timestamp": "2025-09-04 04:05:43.600971", "step": 2280, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:05:51.995079", "step": 2280, "epoch": 2 }, { "type": "pplx", "content": 310.31223764687115, "timestamp": "2025-09-04 04:05:51.997642", "step": 2280, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2280", "timestamp": "2025-09-04 04:05:52.345100", "step": 2280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:05:52.420573", "step": 2280, "epoch": 2 }, { "type": "loss", "content": 0.0019430589163675904, "timestamp": "2025-09-04 04:05:52.435958", "step": 2281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:05:52.540294", "step": 2281, "epoch": 2 }, { "type": "loss", "content": 0.035052590072155, "timestamp": "2025-09-04 04:05:52.559389", "step": 2282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:52.661540", "step": 2282, "epoch": 2 }, { "type": "loss", "content": 0.026764625683426857, "timestamp": "2025-09-04 04:05:52.680240", "step": 2283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:05:52.759445", "step": 2283, "epoch": 2 }, { "type": "loss", "content": 0.010539744980633259, "timestamp": "2025-09-04 04:05:52.774253", "step": 2284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:52.872872", "step": 2284, "epoch": 2 }, { "type": "loss", "content": 0.03832215815782547, "timestamp": "2025-09-04 04:05:52.893433", "step": 2285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:05:53.002613", "step": 2285, "epoch": 2 }, { "type": "loss", "content": 0.005969279911369085, "timestamp": "2025-09-04 04:05:53.022737", "step": 2286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:53.115750", "step": 2286, "epoch": 2 }, { "type": "loss", "content": 0.011899287812411785, "timestamp": "2025-09-04 04:05:53.132397", "step": 2287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:05:53.220053", "step": 2287, "epoch": 2 }, { "type": "loss", "content": 0.01706940494477749, "timestamp": "2025-09-04 04:05:53.236433", "step": 2288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:05:53.325483", "step": 2288, "epoch": 2 }, { "type": "loss", "content": 0.010303936898708344, "timestamp": "2025-09-04 04:05:53.343787", "step": 2289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:05:53.420130", "step": 2289, "epoch": 2 }, { "type": "loss", "content": 0.0042968811467289925, "timestamp": "2025-09-04 04:05:53.433846", "step": 2290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:53.527246", "step": 2290, "epoch": 2 }, { "type": "loss", "content": 0.11020837724208832, "timestamp": "2025-09-04 04:05:53.544270", "step": 2291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:53.646159", "step": 2291, "epoch": 2 }, { "type": "loss", "content": 0.2083846777677536, "timestamp": "2025-09-04 04:05:53.665376", "step": 2292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:05:53.766145", "step": 2292, "epoch": 2 }, { "type": "loss", "content": 0.007627859245985746, "timestamp": "2025-09-04 04:05:53.787078", "step": 2293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:53.885519", "step": 2293, "epoch": 2 }, { "type": "loss", "content": 0.02781866304576397, "timestamp": "2025-09-04 04:05:53.902604", "step": 2294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:54.004580", "step": 2294, "epoch": 2 }, { "type": "loss", "content": 0.0024229728151112795, "timestamp": "2025-09-04 04:05:54.023213", "step": 2295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:05:54.131939", "step": 2295, "epoch": 2 }, { "type": "loss", "content": 0.005142164416611195, "timestamp": "2025-09-04 04:05:54.152617", "step": 2296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:54.252044", "step": 2296, "epoch": 2 }, { "type": "loss", "content": 0.004444428253918886, "timestamp": "2025-09-04 04:05:54.272465", "step": 2297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:05:54.366361", "step": 2297, "epoch": 2 }, { "type": "loss", "content": 0.008370397612452507, "timestamp": "2025-09-04 04:05:54.383416", "step": 2298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:05:54.484682", "step": 2298, "epoch": 2 }, { "type": "loss", "content": 0.020550237968564034, "timestamp": "2025-09-04 04:05:54.503493", "step": 2299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:05:54.614990", "step": 2299, "epoch": 2 }, { "type": "loss", "content": 0.027608707547187805, "timestamp": "2025-09-04 04:05:54.635981", "step": 2300, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:03.087262", "step": 2300, "epoch": 2 }, { "type": "pplx", "content": 302.3328329629629, "timestamp": "2025-09-04 04:06:03.089701", "step": 2300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:06:03.163146", "step": 2300, "epoch": 2 }, { "type": "loss", "content": 0.01112450659275055, "timestamp": "2025-09-04 04:06:03.177907", "step": 2301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:03.280100", "step": 2301, "epoch": 2 }, { "type": "loss", "content": 0.007555335760116577, "timestamp": "2025-09-04 04:06:03.298865", "step": 2302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:06:03.384788", "step": 2302, "epoch": 2 }, { "type": "loss", "content": 0.024827582761645317, "timestamp": "2025-09-04 04:06:03.400041", "step": 2303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:06:03.493523", "step": 2303, "epoch": 2 }, { "type": "loss", "content": 0.006991108413785696, "timestamp": "2025-09-04 04:06:03.511385", "step": 2304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:03.603632", "step": 2304, "epoch": 2 }, { "type": "loss", "content": 0.0047009047120809555, "timestamp": "2025-09-04 04:06:03.622610", "step": 2305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:03.714249", "step": 2305, "epoch": 2 }, { "type": "loss", "content": 0.021796375513076782, "timestamp": "2025-09-04 04:06:03.730911", "step": 2306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:03.838853", "step": 2306, "epoch": 2 }, { "type": "loss", "content": 0.01497070025652647, "timestamp": "2025-09-04 04:06:03.857863", "step": 2307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:06:03.967870", "step": 2307, "epoch": 2 }, { "type": "loss", "content": 0.010811883956193924, "timestamp": "2025-09-04 04:06:03.989029", "step": 2308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:06:04.086437", "step": 2308, "epoch": 2 }, { "type": "loss", "content": 0.007137446664273739, "timestamp": "2025-09-04 04:06:04.105083", "step": 2309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:04.208371", "step": 2309, "epoch": 2 }, { "type": "loss", "content": 0.00834791362285614, "timestamp": "2025-09-04 04:06:04.227423", "step": 2310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:04.332171", "step": 2310, "epoch": 2 }, { "type": "loss", "content": 0.008670263923704624, "timestamp": "2025-09-04 04:06:04.351382", "step": 2311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:04.458108", "step": 2311, "epoch": 2 }, { "type": "loss", "content": 0.009029252454638481, "timestamp": "2025-09-04 04:06:04.476256", "step": 2312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:04.576767", "step": 2312, "epoch": 2 }, { "type": "loss", "content": 0.004705758765339851, "timestamp": "2025-09-04 04:06:04.597508", "step": 2313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:06:04.707664", "step": 2313, "epoch": 2 }, { "type": "loss", "content": 0.0046806796453893185, "timestamp": "2025-09-04 04:06:04.727925", "step": 2314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:04.831143", "step": 2314, "epoch": 2 }, { "type": "loss", "content": 0.03220272809267044, "timestamp": "2025-09-04 04:06:04.850065", "step": 2315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:06:04.935355", "step": 2315, "epoch": 2 }, { "type": "loss", "content": 0.013215369544923306, "timestamp": "2025-09-04 04:06:04.951839", "step": 2316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:06:05.033934", "step": 2316, "epoch": 2 }, { "type": "loss", "content": 0.03970480337738991, "timestamp": "2025-09-04 04:06:05.050832", "step": 2317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:05.140919", "step": 2317, "epoch": 2 }, { "type": "loss", "content": 0.0069844783283770084, "timestamp": "2025-09-04 04:06:05.157670", "step": 2318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:05.257832", "step": 2318, "epoch": 2 }, { "type": "loss", "content": 0.006692732684314251, "timestamp": "2025-09-04 04:06:05.276581", "step": 2319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:05.380384", "step": 2319, "epoch": 2 }, { "type": "loss", "content": 0.02646188624203205, "timestamp": "2025-09-04 04:06:05.400003", "step": 2320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:13.772992", "step": 2320, "epoch": 2 }, { "type": "pplx", "content": 295.6536796467591, "timestamp": "2025-09-04 04:06:13.775137", "step": 2320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2320", "timestamp": "2025-09-04 04:06:14.126831", "step": 2320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:06:14.207180", "step": 2320, "epoch": 2 }, { "type": "loss", "content": 0.009197513572871685, "timestamp": "2025-09-04 04:06:14.223926", "step": 2321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:06:14.332576", "step": 2321, "epoch": 2 }, { "type": "loss", "content": 0.08209947496652603, "timestamp": "2025-09-04 04:06:14.352709", "step": 2322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:06:14.489174", "step": 2322, "epoch": 2 }, { "type": "loss", "content": 0.024091873317956924, "timestamp": "2025-09-04 04:06:14.515467", "step": 2323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:14.619946", "step": 2323, "epoch": 2 }, { "type": "loss", "content": 0.012969830073416233, "timestamp": "2025-09-04 04:06:14.639869", "step": 2324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:06:14.742599", "step": 2324, "epoch": 2 }, { "type": "loss", "content": 0.039127450436353683, "timestamp": "2025-09-04 04:06:14.764493", "step": 2325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:14.868050", "step": 2325, "epoch": 2 }, { "type": "loss", "content": 0.004493629559874535, "timestamp": "2025-09-04 04:06:14.887300", "step": 2326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:14.985953", "step": 2326, "epoch": 2 }, { "type": "loss", "content": 0.005800614599138498, "timestamp": "2025-09-04 04:06:15.004519", "step": 2327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:06:15.086700", "step": 2327, "epoch": 2 }, { "type": "loss", "content": 0.05111802741885185, "timestamp": "2025-09-04 04:06:15.102670", "step": 2328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:06:15.208703", "step": 2328, "epoch": 2 }, { "type": "loss", "content": 0.0008950461633503437, "timestamp": "2025-09-04 04:06:15.231340", "step": 2329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:06:15.325018", "step": 2329, "epoch": 2 }, { "type": "loss", "content": 0.0029820986092090607, "timestamp": "2025-09-04 04:06:15.342129", "step": 2330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:15.442186", "step": 2330, "epoch": 2 }, { "type": "loss", "content": 0.014196853153407574, "timestamp": "2025-09-04 04:06:15.461081", "step": 2331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 04:06:15.634661", "step": 2331, "epoch": 2 }, { "type": "loss", "content": 0.0035891346633434296, "timestamp": "2025-09-04 04:06:15.668205", "step": 2332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:06:15.785753", "step": 2332, "epoch": 2 }, { "type": "loss", "content": 0.0016923088114708662, "timestamp": "2025-09-04 04:06:15.808308", "step": 2333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:15.910974", "step": 2333, "epoch": 2 }, { "type": "loss", "content": 0.0015629819827154279, "timestamp": "2025-09-04 04:06:15.930249", "step": 2334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:06:16.007674", "step": 2334, "epoch": 2 }, { "type": "loss", "content": 0.023518525063991547, "timestamp": "2025-09-04 04:06:16.021688", "step": 2335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:16.124739", "step": 2335, "epoch": 2 }, { "type": "loss", "content": 0.0419270396232605, "timestamp": "2025-09-04 04:06:16.144731", "step": 2336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:16.245748", "step": 2336, "epoch": 2 }, { "type": "loss", "content": 0.005480342078953981, "timestamp": "2025-09-04 04:06:16.266752", "step": 2337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:06:16.377638", "step": 2337, "epoch": 2 }, { "type": "loss", "content": 0.024341512471437454, "timestamp": "2025-09-04 04:06:16.398272", "step": 2338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:16.499844", "step": 2338, "epoch": 2 }, { "type": "loss", "content": 0.017390906810760498, "timestamp": "2025-09-04 04:06:16.518835", "step": 2339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:16.619506", "step": 2339, "epoch": 2 }, { "type": "loss", "content": 0.024550272151827812, "timestamp": "2025-09-04 04:06:16.639169", "step": 2340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:25.127410", "step": 2340, "epoch": 2 }, { "type": "pplx", "content": 295.22586601571055, "timestamp": "2025-09-04 04:06:25.130299", "step": 2340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:06:25.206476", "step": 2340, "epoch": 2 }, { "type": "loss", "content": 0.003865597303956747, "timestamp": "2025-09-04 04:06:25.221563", "step": 2341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:25.325472", "step": 2341, "epoch": 2 }, { "type": "loss", "content": 0.023083921521902084, "timestamp": "2025-09-04 04:06:25.344427", "step": 2342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 04:06:25.474558", "step": 2342, "epoch": 2 }, { "type": "loss", "content": 0.0006907058414071798, "timestamp": "2025-09-04 04:06:25.497944", "step": 2343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:25.591178", "step": 2343, "epoch": 2 }, { "type": "loss", "content": 0.01256527565419674, "timestamp": "2025-09-04 04:06:25.608450", "step": 2344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1008 ], "flops": 20160122450880.0 }, "timestamp": "2025-09-04 04:06:25.751962", "step": 2344, "epoch": 2 }, { "type": "loss", "content": 0.0006791522027924657, "timestamp": "2025-09-04 04:06:25.782760", "step": 2345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:25.888362", "step": 2345, "epoch": 2 }, { "type": "loss", "content": 0.006552206818014383, "timestamp": "2025-09-04 04:06:25.907296", "step": 2346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:06:26.018399", "step": 2346, "epoch": 2 }, { "type": "loss", "content": 0.009997514076530933, "timestamp": "2025-09-04 04:06:26.038487", "step": 2347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:26.138952", "step": 2347, "epoch": 2 }, { "type": "loss", "content": 0.028129128739237785, "timestamp": "2025-09-04 04:06:26.158067", "step": 2348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:26.260093", "step": 2348, "epoch": 2 }, { "type": "loss", "content": 0.0038631020579487085, "timestamp": "2025-09-04 04:06:26.281059", "step": 2349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:26.385515", "step": 2349, "epoch": 2 }, { "type": "loss", "content": 0.00890912301838398, "timestamp": "2025-09-04 04:06:26.404418", "step": 2350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:06:26.487101", "step": 2350, "epoch": 2 }, { "type": "loss", "content": 0.0030983267351984978, "timestamp": "2025-09-04 04:06:26.500988", "step": 2351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:26.592798", "step": 2351, "epoch": 2 }, { "type": "loss", "content": 0.04077935963869095, "timestamp": "2025-09-04 04:06:26.610095", "step": 2352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:06:26.726858", "step": 2352, "epoch": 2 }, { "type": "loss", "content": 0.0391593798995018, "timestamp": "2025-09-04 04:06:26.750833", "step": 2353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:26.846116", "step": 2353, "epoch": 2 }, { "type": "loss", "content": 0.0036758643109351397, "timestamp": "2025-09-04 04:06:26.863271", "step": 2354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:06:26.940414", "step": 2354, "epoch": 2 }, { "type": "loss", "content": 0.0077694314531981945, "timestamp": "2025-09-04 04:06:26.953721", "step": 2355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:27.057469", "step": 2355, "epoch": 2 }, { "type": "loss", "content": 0.035263847559690475, "timestamp": "2025-09-04 04:06:27.077180", "step": 2356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:27.176464", "step": 2356, "epoch": 2 }, { "type": "loss", "content": 0.0017548573669046164, "timestamp": "2025-09-04 04:06:27.196622", "step": 2357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:06:27.333584", "step": 2357, "epoch": 2 }, { "type": "loss", "content": 0.0036123136524111032, "timestamp": "2025-09-04 04:06:27.359290", "step": 2358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:27.462130", "step": 2358, "epoch": 2 }, { "type": "loss", "content": 0.026867439970374107, "timestamp": "2025-09-04 04:06:27.480803", "step": 2359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:06:27.571768", "step": 2359, "epoch": 2 }, { "type": "loss", "content": 0.011943703517317772, "timestamp": "2025-09-04 04:06:27.587944", "step": 2360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:36.064337", "step": 2360, "epoch": 2 }, { "type": "pplx", "content": 298.06816808696493, "timestamp": "2025-09-04 04:06:36.066522", "step": 2360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2360", "timestamp": "2025-09-04 04:06:36.409656", "step": 2360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:36.509058", "step": 2360, "epoch": 2 }, { "type": "loss", "content": 0.01579303853213787, "timestamp": "2025-09-04 04:06:36.530166", "step": 2361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:06:36.608506", "step": 2361, "epoch": 2 }, { "type": "loss", "content": 0.0023343523498624563, "timestamp": "2025-09-04 04:06:36.622543", "step": 2362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:06:36.700242", "step": 2362, "epoch": 2 }, { "type": "loss", "content": 0.030650924891233444, "timestamp": "2025-09-04 04:06:36.714227", "step": 2363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:06:36.824118", "step": 2363, "epoch": 2 }, { "type": "loss", "content": 0.0008962932624854147, "timestamp": "2025-09-04 04:06:36.845479", "step": 2364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:06:36.922413", "step": 2364, "epoch": 2 }, { "type": "loss", "content": 0.008728813380002975, "timestamp": "2025-09-04 04:06:36.937914", "step": 2365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:37.041318", "step": 2365, "epoch": 2 }, { "type": "loss", "content": 0.0025963473599404097, "timestamp": "2025-09-04 04:06:37.060611", "step": 2366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:06:37.167037", "step": 2366, "epoch": 2 }, { "type": "loss", "content": 0.006974723190069199, "timestamp": "2025-09-04 04:06:37.187013", "step": 2367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:37.291474", "step": 2367, "epoch": 2 }, { "type": "loss", "content": 0.026150088757276535, "timestamp": "2025-09-04 04:06:37.311513", "step": 2368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:06:37.416575", "step": 2368, "epoch": 2 }, { "type": "loss", "content": 0.033544786274433136, "timestamp": "2025-09-04 04:06:37.438548", "step": 2369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:37.530032", "step": 2369, "epoch": 2 }, { "type": "loss", "content": 0.0012710822047665715, "timestamp": "2025-09-04 04:06:37.546779", "step": 2370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:37.640113", "step": 2370, "epoch": 2 }, { "type": "loss", "content": 0.004774425644427538, "timestamp": "2025-09-04 04:06:37.657431", "step": 2371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:37.748688", "step": 2371, "epoch": 2 }, { "type": "loss", "content": 0.010192320682108402, "timestamp": "2025-09-04 04:06:37.766189", "step": 2372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:06:37.857984", "step": 2372, "epoch": 2 }, { "type": "loss", "content": 0.047795332968235016, "timestamp": "2025-09-04 04:06:37.877161", "step": 2373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:06:37.960434", "step": 2373, "epoch": 2 }, { "type": "loss", "content": 0.018767505884170532, "timestamp": "2025-09-04 04:06:37.975653", "step": 2374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:38.073920", "step": 2374, "epoch": 2 }, { "type": "loss", "content": 0.017257632687687874, "timestamp": "2025-09-04 04:06:38.092428", "step": 2375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:06:38.194437", "step": 2375, "epoch": 2 }, { "type": "loss", "content": 0.007572493981570005, "timestamp": "2025-09-04 04:06:38.214336", "step": 2376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:38.313762", "step": 2376, "epoch": 2 }, { "type": "loss", "content": 0.0060096923261880875, "timestamp": "2025-09-04 04:06:38.334427", "step": 2377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:06:38.421155", "step": 2377, "epoch": 2 }, { "type": "loss", "content": 0.02415098063647747, "timestamp": "2025-09-04 04:06:38.436761", "step": 2378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:38.530537", "step": 2378, "epoch": 2 }, { "type": "loss", "content": 0.016814231872558594, "timestamp": "2025-09-04 04:06:38.547930", "step": 2379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:38.651752", "step": 2379, "epoch": 2 }, { "type": "loss", "content": 0.0010228022001683712, "timestamp": "2025-09-04 04:06:38.671811", "step": 2380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:47.058540", "step": 2380, "epoch": 2 }, { "type": "pplx", "content": 300.2741385956129, "timestamp": "2025-09-04 04:06:47.060601", "step": 2380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:47.156440", "step": 2380, "epoch": 2 }, { "type": "loss", "content": 0.05420851334929466, "timestamp": "2025-09-04 04:06:47.177116", "step": 2381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:06:47.282495", "step": 2381, "epoch": 2 }, { "type": "loss", "content": 0.013719238340854645, "timestamp": "2025-09-04 04:06:47.302180", "step": 2382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:47.405329", "step": 2382, "epoch": 2 }, { "type": "loss", "content": 0.01382434368133545, "timestamp": "2025-09-04 04:06:47.424573", "step": 2383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1424 ], "flops": 28480172958272.0 }, "timestamp": "2025-09-04 04:06:47.635091", "step": 2383, "epoch": 2 }, { "type": "loss", "content": 0.017751427367329597, "timestamp": "2025-09-04 04:06:47.676429", "step": 2384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:47.773764", "step": 2384, "epoch": 2 }, { "type": "loss", "content": 0.026963358744978905, "timestamp": "2025-09-04 04:06:47.794292", "step": 2385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:47.884251", "step": 2385, "epoch": 2 }, { "type": "loss", "content": 0.0162715595215559, "timestamp": "2025-09-04 04:06:47.901019", "step": 2386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:47.994726", "step": 2386, "epoch": 2 }, { "type": "loss", "content": 0.009239349514245987, "timestamp": "2025-09-04 04:06:48.012230", "step": 2387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:48.114990", "step": 2387, "epoch": 2 }, { "type": "loss", "content": 0.02234196476638317, "timestamp": "2025-09-04 04:06:48.135143", "step": 2388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:06:48.216079", "step": 2388, "epoch": 2 }, { "type": "loss", "content": 0.031179826706647873, "timestamp": "2025-09-04 04:06:48.232525", "step": 2389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:06:48.340364", "step": 2389, "epoch": 2 }, { "type": "loss", "content": 0.025329116731882095, "timestamp": "2025-09-04 04:06:48.360594", "step": 2390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:48.461110", "step": 2390, "epoch": 2 }, { "type": "loss", "content": 0.022294968366622925, "timestamp": "2025-09-04 04:06:48.480072", "step": 2391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:06:48.576529", "step": 2391, "epoch": 2 }, { "type": "loss", "content": 0.0785910040140152, "timestamp": "2025-09-04 04:06:48.594808", "step": 2392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:06:48.668726", "step": 2392, "epoch": 2 }, { "type": "loss", "content": 0.024669643491506577, "timestamp": "2025-09-04 04:06:48.683548", "step": 2393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:06:48.782655", "step": 2393, "epoch": 2 }, { "type": "loss", "content": 0.023527929559350014, "timestamp": "2025-09-04 04:06:48.801350", "step": 2394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:06:48.917923", "step": 2394, "epoch": 2 }, { "type": "loss", "content": 0.01429518312215805, "timestamp": "2025-09-04 04:06:48.940049", "step": 2395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:49.043345", "step": 2395, "epoch": 2 }, { "type": "loss", "content": 0.003827124135568738, "timestamp": "2025-09-04 04:06:49.063366", "step": 2396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:06:49.204787", "step": 2396, "epoch": 2 }, { "type": "loss", "content": 0.010013996623456478, "timestamp": "2025-09-04 04:06:49.235597", "step": 2397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:06:49.343503", "step": 2397, "epoch": 2 }, { "type": "loss", "content": 0.011067106388509274, "timestamp": "2025-09-04 04:06:49.363807", "step": 2398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:06:49.460343", "step": 2398, "epoch": 2 }, { "type": "loss", "content": 0.000826965959277004, "timestamp": "2025-09-04 04:06:49.477861", "step": 2399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:06:49.572217", "step": 2399, "epoch": 2 }, { "type": "loss", "content": 0.03683660551905632, "timestamp": "2025-09-04 04:06:49.590419", "step": 2400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:06:57.987370", "step": 2400, "epoch": 2 }, { "type": "pplx", "content": 299.87191027633986, "timestamp": "2025-09-04 04:06:57.989481", "step": 2400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2400", "timestamp": "2025-09-04 04:06:58.497954", "step": 2400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:58.598527", "step": 2400, "epoch": 2 }, { "type": "loss", "content": 0.0408879779279232, "timestamp": "2025-09-04 04:06:58.619076", "step": 2401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:06:58.763688", "step": 2401, "epoch": 2 }, { "type": "loss", "content": 0.07916318625211716, "timestamp": "2025-09-04 04:06:58.783853", "step": 2402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:06:58.913911", "step": 2402, "epoch": 2 }, { "type": "loss", "content": 0.009972754865884781, "timestamp": "2025-09-04 04:06:58.933370", "step": 2403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:06:59.098106", "step": 2403, "epoch": 2 }, { "type": "loss", "content": 0.005085110664367676, "timestamp": "2025-09-04 04:06:59.120132", "step": 2404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:06:59.303364", "step": 2404, "epoch": 2 }, { "type": "loss", "content": 0.006083235610276461, "timestamp": "2025-09-04 04:06:59.325785", "step": 2405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:06:59.433171", "step": 2405, "epoch": 2 }, { "type": "loss", "content": 0.018005046993494034, "timestamp": "2025-09-04 04:06:59.450765", "step": 2406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:06:59.565299", "step": 2406, "epoch": 2 }, { "type": "loss", "content": 0.0005014762282371521, "timestamp": "2025-09-04 04:06:59.584616", "step": 2407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:06:59.661630", "step": 2407, "epoch": 2 }, { "type": "loss", "content": 0.008235539309680462, "timestamp": "2025-09-04 04:06:59.675910", "step": 2408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:06:59.818065", "step": 2408, "epoch": 2 }, { "type": "loss", "content": 0.010737213306128979, "timestamp": "2025-09-04 04:06:59.837867", "step": 2409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:00.009283", "step": 2409, "epoch": 2 }, { "type": "loss", "content": 0.03348386660218239, "timestamp": "2025-09-04 04:07:00.028112", "step": 2410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:00.148227", "step": 2410, "epoch": 2 }, { "type": "loss", "content": 0.003203654196113348, "timestamp": "2025-09-04 04:07:00.167426", "step": 2411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:00.280154", "step": 2411, "epoch": 2 }, { "type": "loss", "content": 0.012616422958672047, "timestamp": "2025-09-04 04:07:00.301563", "step": 2412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:07:00.386966", "step": 2412, "epoch": 2 }, { "type": "loss", "content": 0.011091896332800388, "timestamp": "2025-09-04 04:07:00.403586", "step": 2413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 04:07:00.586895", "step": 2413, "epoch": 2 }, { "type": "loss", "content": 0.012619656510651112, "timestamp": "2025-09-04 04:07:00.620884", "step": 2414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:00.732799", "step": 2414, "epoch": 2 }, { "type": "loss", "content": 0.02890246920287609, "timestamp": "2025-09-04 04:07:00.751815", "step": 2415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:00.890554", "step": 2415, "epoch": 2 }, { "type": "loss", "content": 0.023879971355199814, "timestamp": "2025-09-04 04:07:00.911487", "step": 2416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:01.035963", "step": 2416, "epoch": 2 }, { "type": "loss", "content": 0.009976472705602646, "timestamp": "2025-09-04 04:07:01.056672", "step": 2417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:07:01.187998", "step": 2417, "epoch": 2 }, { "type": "loss", "content": 0.038926564157009125, "timestamp": "2025-09-04 04:07:01.201531", "step": 2418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:07:01.314823", "step": 2418, "epoch": 2 }, { "type": "loss", "content": 0.01832580380141735, "timestamp": "2025-09-04 04:07:01.332331", "step": 2419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:01.421325", "step": 2419, "epoch": 2 }, { "type": "loss", "content": 0.00956509169191122, "timestamp": "2025-09-04 04:07:01.437538", "step": 2420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:07:09.904505", "step": 2420, "epoch": 2 }, { "type": "pplx", "content": 303.06925253168487, "timestamp": "2025-09-04 04:07:09.907076", "step": 2420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:07:09.979738", "step": 2420, "epoch": 2 }, { "type": "loss", "content": 0.037451133131980896, "timestamp": "2025-09-04 04:07:09.994374", "step": 2421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:07:10.089818", "step": 2421, "epoch": 2 }, { "type": "loss", "content": 0.013809522613883018, "timestamp": "2025-09-04 04:07:10.107208", "step": 2422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:10.209562", "step": 2422, "epoch": 2 }, { "type": "loss", "content": 0.03779786825180054, "timestamp": "2025-09-04 04:07:10.228790", "step": 2423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:10.316057", "step": 2423, "epoch": 2 }, { "type": "loss", "content": 0.013304756954312325, "timestamp": "2025-09-04 04:07:10.332431", "step": 2424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1232 ], "flops": 24640149647168.0 }, "timestamp": "2025-09-04 04:07:10.511846", "step": 2424, "epoch": 2 }, { "type": "loss", "content": 0.04832831397652626, "timestamp": "2025-09-04 04:07:10.549441", "step": 2425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:10.636716", "step": 2425, "epoch": 2 }, { "type": "loss", "content": 0.00974750891327858, "timestamp": "2025-09-04 04:07:10.652143", "step": 2426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:10.756272", "step": 2426, "epoch": 2 }, { "type": "loss", "content": 0.04315938055515289, "timestamp": "2025-09-04 04:07:10.775526", "step": 2427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:10.874657", "step": 2427, "epoch": 2 }, { "type": "loss", "content": 0.00986342690885067, "timestamp": "2025-09-04 04:07:10.894019", "step": 2428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:07:11.028846", "step": 2428, "epoch": 2 }, { "type": "loss", "content": 0.0031775757670402527, "timestamp": "2025-09-04 04:07:11.057582", "step": 2429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:11.143954", "step": 2429, "epoch": 2 }, { "type": "loss", "content": 0.024582451209425926, "timestamp": "2025-09-04 04:07:11.159488", "step": 2430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:07:11.269272", "step": 2430, "epoch": 2 }, { "type": "loss", "content": 0.09256591647863388, "timestamp": "2025-09-04 04:07:11.289807", "step": 2431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:11.391712", "step": 2431, "epoch": 2 }, { "type": "loss", "content": 0.014577627182006836, "timestamp": "2025-09-04 04:07:11.411644", "step": 2432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:07:11.501022", "step": 2432, "epoch": 2 }, { "type": "loss", "content": 0.0009501639287918806, "timestamp": "2025-09-04 04:07:11.519394", "step": 2433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:11.622587", "step": 2433, "epoch": 2 }, { "type": "loss", "content": 0.022495364770293236, "timestamp": "2025-09-04 04:07:11.641457", "step": 2434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:07:11.718910", "step": 2434, "epoch": 2 }, { "type": "loss", "content": 0.013360848650336266, "timestamp": "2025-09-04 04:07:11.732852", "step": 2435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:11.818320", "step": 2435, "epoch": 2 }, { "type": "loss", "content": 0.006631570402532816, "timestamp": "2025-09-04 04:07:11.834542", "step": 2436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:11.933817", "step": 2436, "epoch": 2 }, { "type": "loss", "content": 0.004165567457675934, "timestamp": "2025-09-04 04:07:11.954489", "step": 2437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:12.059057", "step": 2437, "epoch": 2 }, { "type": "loss", "content": 0.002891003619879484, "timestamp": "2025-09-04 04:07:12.078289", "step": 2438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:12.177201", "step": 2438, "epoch": 2 }, { "type": "loss", "content": 0.00025852315593510866, "timestamp": "2025-09-04 04:07:12.195858", "step": 2439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 04:07:12.333048", "step": 2439, "epoch": 2 }, { "type": "loss", "content": 0.0613558404147625, "timestamp": "2025-09-04 04:07:12.360058", "step": 2440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:07:20.744500", "step": 2440, "epoch": 2 }, { "type": "pplx", "content": 310.24100791067593, "timestamp": "2025-09-04 04:07:20.746658", "step": 2440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2440", "timestamp": "2025-09-04 04:07:21.256656", "step": 2440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:07:21.373924", "step": 2440, "epoch": 2 }, { "type": "loss", "content": 0.0030447612516582012, "timestamp": "2025-09-04 04:07:21.399154", "step": 2441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:07:21.602494", "step": 2441, "epoch": 2 }, { "type": "loss", "content": 0.028010720387101173, "timestamp": "2025-09-04 04:07:21.641816", "step": 2442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:21.751643", "step": 2442, "epoch": 2 }, { "type": "loss", "content": 0.00379212130792439, "timestamp": "2025-09-04 04:07:21.772269", "step": 2443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:07:21.866937", "step": 2443, "epoch": 2 }, { "type": "loss", "content": 0.0061768838204443455, "timestamp": "2025-09-04 04:07:21.885100", "step": 2444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:07:21.975206", "step": 2444, "epoch": 2 }, { "type": "loss", "content": 0.011033882386982441, "timestamp": "2025-09-04 04:07:21.994027", "step": 2445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:07:22.104069", "step": 2445, "epoch": 2 }, { "type": "loss", "content": 0.01739361509680748, "timestamp": "2025-09-04 04:07:22.124669", "step": 2446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:22.234369", "step": 2446, "epoch": 2 }, { "type": "loss", "content": 0.020273465663194656, "timestamp": "2025-09-04 04:07:22.255184", "step": 2447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:22.357414", "step": 2447, "epoch": 2 }, { "type": "loss", "content": 0.021363025531172752, "timestamp": "2025-09-04 04:07:22.377554", "step": 2448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:07:22.470270", "step": 2448, "epoch": 2 }, { "type": "loss", "content": 0.012352973222732544, "timestamp": "2025-09-04 04:07:22.489610", "step": 2449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:22.574849", "step": 2449, "epoch": 2 }, { "type": "loss", "content": 0.009310700930655003, "timestamp": "2025-09-04 04:07:22.590465", "step": 2450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:07:22.681235", "step": 2450, "epoch": 2 }, { "type": "loss", "content": 0.012666204944252968, "timestamp": "2025-09-04 04:07:22.698051", "step": 2451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:22.782113", "step": 2451, "epoch": 2 }, { "type": "loss", "content": 0.002646137960255146, "timestamp": "2025-09-04 04:07:22.798362", "step": 2452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:07:22.889307", "step": 2452, "epoch": 2 }, { "type": "loss", "content": 0.03072880022227764, "timestamp": "2025-09-04 04:07:22.908543", "step": 2453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:23.012169", "step": 2453, "epoch": 2 }, { "type": "loss", "content": 0.01461123675107956, "timestamp": "2025-09-04 04:07:23.031423", "step": 2454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:23.143359", "step": 2454, "epoch": 2 }, { "type": "loss", "content": 0.010309387929737568, "timestamp": "2025-09-04 04:07:23.164090", "step": 2455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:23.267195", "step": 2455, "epoch": 2 }, { "type": "loss", "content": 0.07364504784345627, "timestamp": "2025-09-04 04:07:23.287265", "step": 2456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:23.371287", "step": 2456, "epoch": 2 }, { "type": "loss", "content": 0.028103487566113472, "timestamp": "2025-09-04 04:07:23.388274", "step": 2457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:07:23.516938", "step": 2457, "epoch": 2 }, { "type": "loss", "content": 0.03834032267332077, "timestamp": "2025-09-04 04:07:23.540149", "step": 2458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:23.651140", "step": 2458, "epoch": 2 }, { "type": "loss", "content": 0.0072427773848176, "timestamp": "2025-09-04 04:07:23.671797", "step": 2459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:23.770961", "step": 2459, "epoch": 2 }, { "type": "loss", "content": 0.0034062564373016357, "timestamp": "2025-09-04 04:07:23.790327", "step": 2460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:07:32.212762", "step": 2460, "epoch": 2 }, { "type": "pplx", "content": 315.36553546678925, "timestamp": "2025-09-04 04:07:32.215085", "step": 2460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:07:32.320973", "step": 2460, "epoch": 2 }, { "type": "loss", "content": 0.0368022620677948, "timestamp": "2025-09-04 04:07:32.343546", "step": 2461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:32.446908", "step": 2461, "epoch": 2 }, { "type": "loss", "content": 0.014917043037712574, "timestamp": "2025-09-04 04:07:32.466067", "step": 2462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:07:32.543207", "step": 2462, "epoch": 2 }, { "type": "loss", "content": 0.07748901844024658, "timestamp": "2025-09-04 04:07:32.557180", "step": 2463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:32.661303", "step": 2463, "epoch": 2 }, { "type": "loss", "content": 0.006906221155077219, "timestamp": "2025-09-04 04:07:32.681071", "step": 2464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:32.777954", "step": 2464, "epoch": 2 }, { "type": "loss", "content": 0.03316435217857361, "timestamp": "2025-09-04 04:07:32.798213", "step": 2465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:07:32.892126", "step": 2465, "epoch": 2 }, { "type": "loss", "content": 0.0017650446388870478, "timestamp": "2025-09-04 04:07:32.909510", "step": 2466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:07:33.055191", "step": 2466, "epoch": 2 }, { "type": "loss", "content": 0.010998114012181759, "timestamp": "2025-09-04 04:07:33.083267", "step": 2467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:07:33.153289", "step": 2467, "epoch": 2 }, { "type": "loss", "content": 0.00827596615999937, "timestamp": "2025-09-04 04:07:33.166670", "step": 2468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:33.267008", "step": 2468, "epoch": 2 }, { "type": "loss", "content": 0.010986930690705776, "timestamp": "2025-09-04 04:07:33.288172", "step": 2469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:07:33.398361", "step": 2469, "epoch": 2 }, { "type": "loss", "content": 0.002616006415337324, "timestamp": "2025-09-04 04:07:33.418487", "step": 2470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 04:07:33.581804", "step": 2470, "epoch": 2 }, { "type": "loss", "content": 0.010370907373726368, "timestamp": "2025-09-04 04:07:33.613941", "step": 2471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:07:33.700684", "step": 2471, "epoch": 2 }, { "type": "loss", "content": 0.018909158185124397, "timestamp": "2025-09-04 04:07:33.716679", "step": 2472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:33.821067", "step": 2472, "epoch": 2 }, { "type": "loss", "content": 0.03325289860367775, "timestamp": "2025-09-04 04:07:33.842023", "step": 2473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:07:33.912109", "step": 2473, "epoch": 2 }, { "type": "loss", "content": 0.0038535459898412228, "timestamp": "2025-09-04 04:07:33.924635", "step": 2474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:34.035300", "step": 2474, "epoch": 2 }, { "type": "loss", "content": 0.03316061198711395, "timestamp": "2025-09-04 04:07:34.055652", "step": 2475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:34.148736", "step": 2475, "epoch": 2 }, { "type": "loss", "content": 0.042802825570106506, "timestamp": "2025-09-04 04:07:34.164952", "step": 2476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:34.265241", "step": 2476, "epoch": 2 }, { "type": "loss", "content": 0.013982797972857952, "timestamp": "2025-09-04 04:07:34.286061", "step": 2477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:34.388236", "step": 2477, "epoch": 2 }, { "type": "loss", "content": 0.00205409643240273, "timestamp": "2025-09-04 04:07:34.406622", "step": 2478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:07:34.489076", "step": 2478, "epoch": 2 }, { "type": "loss", "content": 0.0019991924054920673, "timestamp": "2025-09-04 04:07:34.503233", "step": 2479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:07:34.612716", "step": 2479, "epoch": 2 }, { "type": "loss", "content": 0.03043895773589611, "timestamp": "2025-09-04 04:07:34.633505", "step": 2480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:07:43.012781", "step": 2480, "epoch": 2 }, { "type": "pplx", "content": 318.30207618622006, "timestamp": "2025-09-04 04:07:43.014765", "step": 2480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2480", "timestamp": "2025-09-04 04:07:43.373304", "step": 2480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:43.456246", "step": 2480, "epoch": 2 }, { "type": "loss", "content": 0.023624232038855553, "timestamp": "2025-09-04 04:07:43.473367", "step": 2481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:07:43.567295", "step": 2481, "epoch": 2 }, { "type": "loss", "content": 0.028039980679750443, "timestamp": "2025-09-04 04:07:43.584815", "step": 2482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:07:43.691965", "step": 2482, "epoch": 2 }, { "type": "loss", "content": 0.04251798987388611, "timestamp": "2025-09-04 04:07:43.712226", "step": 2483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:07:43.788241", "step": 2483, "epoch": 2 }, { "type": "loss", "content": 0.013368581421673298, "timestamp": "2025-09-04 04:07:43.802722", "step": 2484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:07:43.906725", "step": 2484, "epoch": 2 }, { "type": "loss", "content": 0.00481030810624361, "timestamp": "2025-09-04 04:07:43.928625", "step": 2485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:44.039992", "step": 2485, "epoch": 2 }, { "type": "loss", "content": 0.03684372827410698, "timestamp": "2025-09-04 04:07:44.060654", "step": 2486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:07:44.153521", "step": 2486, "epoch": 2 }, { "type": "loss", "content": 0.005444700364023447, "timestamp": "2025-09-04 04:07:44.170668", "step": 2487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:44.270187", "step": 2487, "epoch": 2 }, { "type": "loss", "content": 0.0024230824783444405, "timestamp": "2025-09-04 04:07:44.289865", "step": 2488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1088 ], "flops": 21760132163840.0 }, "timestamp": "2025-09-04 04:07:44.443208", "step": 2488, "epoch": 2 }, { "type": "loss", "content": 0.026805497705936432, "timestamp": "2025-09-04 04:07:44.476712", "step": 2489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:44.579744", "step": 2489, "epoch": 2 }, { "type": "loss", "content": 0.02435290813446045, "timestamp": "2025-09-04 04:07:44.599073", "step": 2490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:07:44.692908", "step": 2490, "epoch": 2 }, { "type": "loss", "content": 0.0021042758598923683, "timestamp": "2025-09-04 04:07:44.710316", "step": 2491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:07:44.819017", "step": 2491, "epoch": 2 }, { "type": "loss", "content": 0.005590126849710941, "timestamp": "2025-09-04 04:07:44.840113", "step": 2492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:07:44.923546", "step": 2492, "epoch": 2 }, { "type": "loss", "content": 0.008740575052797794, "timestamp": "2025-09-04 04:07:44.940517", "step": 2493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:45.026735", "step": 2493, "epoch": 2 }, { "type": "loss", "content": 0.024248680099844933, "timestamp": "2025-09-04 04:07:45.042313", "step": 2494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:45.142261", "step": 2494, "epoch": 2 }, { "type": "loss", "content": 0.028292085975408554, "timestamp": "2025-09-04 04:07:45.161082", "step": 2495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:07:45.255265", "step": 2495, "epoch": 2 }, { "type": "loss", "content": 0.014871833845973015, "timestamp": "2025-09-04 04:07:45.273507", "step": 2496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:45.374285", "step": 2496, "epoch": 2 }, { "type": "loss", "content": 0.017097758129239082, "timestamp": "2025-09-04 04:07:45.395462", "step": 2497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:07:45.515770", "step": 2497, "epoch": 2 }, { "type": "loss", "content": 0.014303861185908318, "timestamp": "2025-09-04 04:07:45.537472", "step": 2498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:07:45.628732", "step": 2498, "epoch": 2 }, { "type": "loss", "content": 0.0032022215891629457, "timestamp": "2025-09-04 04:07:45.645560", "step": 2499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:07:45.740982", "step": 2499, "epoch": 2 }, { "type": "loss", "content": 0.0452834852039814, "timestamp": "2025-09-04 04:07:45.759178", "step": 2500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:07:54.128547", "step": 2500, "epoch": 2 }, { "type": "pplx", "content": 319.347960568569, "timestamp": "2025-09-04 04:07:54.130884", "step": 2500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 04:07:54.190519", "step": 2500, "epoch": 2 }, { "type": "loss", "content": 0.007545833010226488, "timestamp": "2025-09-04 04:07:54.202250", "step": 2501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:07:54.284283", "step": 2501, "epoch": 2 }, { "type": "loss", "content": 0.008382921107113361, "timestamp": "2025-09-04 04:07:54.299354", "step": 2502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 04:07:54.428234", "step": 2502, "epoch": 2 }, { "type": "loss", "content": 0.010973788797855377, "timestamp": "2025-09-04 04:07:54.452789", "step": 2503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:54.538660", "step": 2503, "epoch": 2 }, { "type": "loss", "content": 0.04710651561617851, "timestamp": "2025-09-04 04:07:54.555139", "step": 2504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:54.661887", "step": 2504, "epoch": 2 }, { "type": "loss", "content": 0.00142197054810822, "timestamp": "2025-09-04 04:07:54.684651", "step": 2505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:07:54.794898", "step": 2505, "epoch": 2 }, { "type": "loss", "content": 0.04192418232560158, "timestamp": "2025-09-04 04:07:54.815339", "step": 2506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:54.918276", "step": 2506, "epoch": 2 }, { "type": "loss", "content": 0.024896910414099693, "timestamp": "2025-09-04 04:07:54.937467", "step": 2507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:55.038731", "step": 2507, "epoch": 2 }, { "type": "loss", "content": 0.02149958163499832, "timestamp": "2025-09-04 04:07:55.058382", "step": 2508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:07:55.145206", "step": 2508, "epoch": 2 }, { "type": "loss", "content": 0.012380536645650864, "timestamp": "2025-09-04 04:07:55.160522", "step": 2509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:07:55.270997", "step": 2509, "epoch": 2 }, { "type": "loss", "content": 0.0016803938196972013, "timestamp": "2025-09-04 04:07:55.291308", "step": 2510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:07:55.396245", "step": 2510, "epoch": 2 }, { "type": "loss", "content": 0.015394407324492931, "timestamp": "2025-09-04 04:07:55.415316", "step": 2511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:55.518957", "step": 2511, "epoch": 2 }, { "type": "loss", "content": 0.005977288819849491, "timestamp": "2025-09-04 04:07:55.538996", "step": 2512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:07:55.626412", "step": 2512, "epoch": 2 }, { "type": "loss", "content": 0.01742619276046753, "timestamp": "2025-09-04 04:07:55.644714", "step": 2513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:55.747687", "step": 2513, "epoch": 2 }, { "type": "loss", "content": 0.006442517042160034, "timestamp": "2025-09-04 04:07:55.767073", "step": 2514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:55.866978", "step": 2514, "epoch": 2 }, { "type": "loss", "content": 0.013937929645180702, "timestamp": "2025-09-04 04:07:55.885738", "step": 2515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:07:55.987972", "step": 2515, "epoch": 2 }, { "type": "loss", "content": 0.005017734598368406, "timestamp": "2025-09-04 04:07:56.007994", "step": 2516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:07:56.125464", "step": 2516, "epoch": 2 }, { "type": "loss", "content": 0.002909827046096325, "timestamp": "2025-09-04 04:07:56.150788", "step": 2517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:07:56.236953", "step": 2517, "epoch": 2 }, { "type": "loss", "content": 0.0124394865706563, "timestamp": "2025-09-04 04:07:56.252576", "step": 2518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:07:56.352113", "step": 2518, "epoch": 2 }, { "type": "loss", "content": 0.00413033552467823, "timestamp": "2025-09-04 04:07:56.370492", "step": 2519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:07:56.470526", "step": 2519, "epoch": 2 }, { "type": "loss", "content": 0.044936034828424454, "timestamp": "2025-09-04 04:07:56.490181", "step": 2520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:04.841099", "step": 2520, "epoch": 2 }, { "type": "pplx", "content": 321.3946038942604, "timestamp": "2025-09-04 04:08:04.843128", "step": 2520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2520", "timestamp": "2025-09-04 04:08:05.181843", "step": 2520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:05.281078", "step": 2520, "epoch": 2 }, { "type": "loss", "content": 0.004612304270267487, "timestamp": "2025-09-04 04:08:05.301969", "step": 2521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:05.397001", "step": 2521, "epoch": 2 }, { "type": "loss", "content": 0.028999704867601395, "timestamp": "2025-09-04 04:08:05.414484", "step": 2522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:05.518293", "step": 2522, "epoch": 2 }, { "type": "loss", "content": 0.006988754495978355, "timestamp": "2025-09-04 04:08:05.537400", "step": 2523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:05.636988", "step": 2523, "epoch": 2 }, { "type": "loss", "content": 0.07554414868354797, "timestamp": "2025-09-04 04:08:05.656374", "step": 2524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:05.756212", "step": 2524, "epoch": 2 }, { "type": "loss", "content": 0.004102020058780909, "timestamp": "2025-09-04 04:08:05.776850", "step": 2525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:05.885428", "step": 2525, "epoch": 2 }, { "type": "loss", "content": 0.0004931488656438887, "timestamp": "2025-09-04 04:08:05.904557", "step": 2526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:06.012440", "step": 2526, "epoch": 2 }, { "type": "loss", "content": 0.02074488066136837, "timestamp": "2025-09-04 04:08:06.032722", "step": 2527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:08:06.148947", "step": 2527, "epoch": 2 }, { "type": "loss", "content": 0.00027644523652270436, "timestamp": "2025-09-04 04:08:06.171613", "step": 2528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:06.263638", "step": 2528, "epoch": 2 }, { "type": "loss", "content": 0.0038577071391046047, "timestamp": "2025-09-04 04:08:06.282745", "step": 2529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:08:06.360604", "step": 2529, "epoch": 2 }, { "type": "loss", "content": 0.026262257248163223, "timestamp": "2025-09-04 04:08:06.374738", "step": 2530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:08:06.481073", "step": 2530, "epoch": 2 }, { "type": "loss", "content": 0.01689167320728302, "timestamp": "2025-09-04 04:08:06.500842", "step": 2531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:06.603402", "step": 2531, "epoch": 2 }, { "type": "loss", "content": 0.04883643984794617, "timestamp": "2025-09-04 04:08:06.623391", "step": 2532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:08:06.706852", "step": 2532, "epoch": 2 }, { "type": "loss", "content": 0.005458258092403412, "timestamp": "2025-09-04 04:08:06.723480", "step": 2533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:08:06.840181", "step": 2533, "epoch": 2 }, { "type": "loss", "content": 0.01368082407861948, "timestamp": "2025-09-04 04:08:06.862251", "step": 2534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:08:06.938420", "step": 2534, "epoch": 2 }, { "type": "loss", "content": 0.008800865150988102, "timestamp": "2025-09-04 04:08:06.952160", "step": 2535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:08:07.058634", "step": 2535, "epoch": 2 }, { "type": "loss", "content": 0.0017213658429682255, "timestamp": "2025-09-04 04:08:07.079073", "step": 2536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:08:07.187701", "step": 2536, "epoch": 2 }, { "type": "loss", "content": 0.024919630959630013, "timestamp": "2025-09-04 04:08:07.210278", "step": 2537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:07.303713", "step": 2537, "epoch": 2 }, { "type": "loss", "content": 0.005876360926777124, "timestamp": "2025-09-04 04:08:07.320835", "step": 2538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:08:07.405122", "step": 2538, "epoch": 2 }, { "type": "loss", "content": 0.06584823131561279, "timestamp": "2025-09-04 04:08:07.420373", "step": 2539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:07.513278", "step": 2539, "epoch": 2 }, { "type": "loss", "content": 0.01981205679476261, "timestamp": "2025-09-04 04:08:07.531141", "step": 2540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:15.901844", "step": 2540, "epoch": 2 }, { "type": "pplx", "content": 322.01266845314177, "timestamp": "2025-09-04 04:08:15.903878", "step": 2540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:08:16.008572", "step": 2540, "epoch": 2 }, { "type": "loss", "content": 0.0399375818669796, "timestamp": "2025-09-04 04:08:16.031089", "step": 2541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:08:16.140199", "step": 2541, "epoch": 2 }, { "type": "loss", "content": 0.004767777398228645, "timestamp": "2025-09-04 04:08:16.160739", "step": 2542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:16.256471", "step": 2542, "epoch": 2 }, { "type": "loss", "content": 0.02117767557501793, "timestamp": "2025-09-04 04:08:16.273940", "step": 2543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:08:16.344595", "step": 2543, "epoch": 2 }, { "type": "loss", "content": 0.010128835216164589, "timestamp": "2025-09-04 04:08:16.358124", "step": 2544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:08:16.439671", "step": 2544, "epoch": 2 }, { "type": "loss", "content": 0.0018931415397673845, "timestamp": "2025-09-04 04:08:16.456305", "step": 2545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:16.555368", "step": 2545, "epoch": 2 }, { "type": "loss", "content": 0.011107025668025017, "timestamp": "2025-09-04 04:08:16.573896", "step": 2546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:16.674332", "step": 2546, "epoch": 2 }, { "type": "loss", "content": 0.022087909281253815, "timestamp": "2025-09-04 04:08:16.693190", "step": 2547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:16.796126", "step": 2547, "epoch": 2 }, { "type": "loss", "content": 0.04829714074730873, "timestamp": "2025-09-04 04:08:16.816098", "step": 2548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:16.916158", "step": 2548, "epoch": 2 }, { "type": "loss", "content": 0.009213853627443314, "timestamp": "2025-09-04 04:08:16.936557", "step": 2549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:08:17.043596", "step": 2549, "epoch": 2 }, { "type": "loss", "content": 0.029857391491532326, "timestamp": "2025-09-04 04:08:17.063569", "step": 2550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:17.167443", "step": 2550, "epoch": 2 }, { "type": "loss", "content": 0.004018782638013363, "timestamp": "2025-09-04 04:08:17.186721", "step": 2551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:08:17.261179", "step": 2551, "epoch": 2 }, { "type": "loss", "content": 0.029023800045251846, "timestamp": "2025-09-04 04:08:17.275555", "step": 2552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:17.366789", "step": 2552, "epoch": 2 }, { "type": "loss", "content": 0.026715071871876717, "timestamp": "2025-09-04 04:08:17.388002", "step": 2553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:08:17.540389", "step": 2553, "epoch": 2 }, { "type": "loss", "content": 0.048667825758457184, "timestamp": "2025-09-04 04:08:17.560427", "step": 2554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:08:17.700200", "step": 2554, "epoch": 2 }, { "type": "loss", "content": 0.02002848871052265, "timestamp": "2025-09-04 04:08:17.715659", "step": 2555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:17.829153", "step": 2555, "epoch": 2 }, { "type": "loss", "content": 0.015101251192390919, "timestamp": "2025-09-04 04:08:17.847046", "step": 2556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:08:17.937680", "step": 2556, "epoch": 2 }, { "type": "loss", "content": 0.001839878037571907, "timestamp": "2025-09-04 04:08:17.956055", "step": 2557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:08:18.033760", "step": 2557, "epoch": 2 }, { "type": "loss", "content": 0.0144809540361166, "timestamp": "2025-09-04 04:08:18.047547", "step": 2558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:18.152111", "step": 2558, "epoch": 2 }, { "type": "loss", "content": 0.002240030327811837, "timestamp": "2025-09-04 04:08:18.171263", "step": 2559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:18.273966", "step": 2559, "epoch": 2 }, { "type": "loss", "content": 0.006072205025702715, "timestamp": "2025-09-04 04:08:18.293971", "step": 2560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:26.673676", "step": 2560, "epoch": 2 }, { "type": "pplx", "content": 321.77623584602946, "timestamp": "2025-09-04 04:08:26.675704", "step": 2560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2560", "timestamp": "2025-09-04 04:08:27.019902", "step": 2560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:08:27.100761", "step": 2560, "epoch": 2 }, { "type": "loss", "content": 0.025917798280715942, "timestamp": "2025-09-04 04:08:27.117323", "step": 2561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:27.225582", "step": 2561, "epoch": 2 }, { "type": "loss", "content": 0.025179943069815636, "timestamp": "2025-09-04 04:08:27.245639", "step": 2562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:27.347057", "step": 2562, "epoch": 2 }, { "type": "loss", "content": 0.014063333161175251, "timestamp": "2025-09-04 04:08:27.366276", "step": 2563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:27.460070", "step": 2563, "epoch": 2 }, { "type": "loss", "content": 0.0029370649717748165, "timestamp": "2025-09-04 04:08:27.477708", "step": 2564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:27.582741", "step": 2564, "epoch": 2 }, { "type": "loss", "content": 0.014166107401251793, "timestamp": "2025-09-04 04:08:27.604754", "step": 2565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:08:27.715409", "step": 2565, "epoch": 2 }, { "type": "loss", "content": 0.06013331934809685, "timestamp": "2025-09-04 04:08:27.736200", "step": 2566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:08:27.881746", "step": 2566, "epoch": 2 }, { "type": "loss", "content": 0.008028823882341385, "timestamp": "2025-09-04 04:08:27.909876", "step": 2567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:28.010271", "step": 2567, "epoch": 2 }, { "type": "loss", "content": 0.04439549893140793, "timestamp": "2025-09-04 04:08:28.029939", "step": 2568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:28.120319", "step": 2568, "epoch": 2 }, { "type": "loss", "content": 0.001983765745535493, "timestamp": "2025-09-04 04:08:28.139120", "step": 2569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:08:28.222574", "step": 2569, "epoch": 2 }, { "type": "loss", "content": 0.13214930891990662, "timestamp": "2025-09-04 04:08:28.237917", "step": 2570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:28.341573", "step": 2570, "epoch": 2 }, { "type": "loss", "content": 0.02340877056121826, "timestamp": "2025-09-04 04:08:28.360697", "step": 2571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:08:28.437650", "step": 2571, "epoch": 2 }, { "type": "loss", "content": 0.03975323215126991, "timestamp": "2025-09-04 04:08:28.452202", "step": 2572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:08:28.534148", "step": 2572, "epoch": 2 }, { "type": "loss", "content": 0.013518854975700378, "timestamp": "2025-09-04 04:08:28.551256", "step": 2573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:28.660249", "step": 2573, "epoch": 2 }, { "type": "loss", "content": 0.039588313549757004, "timestamp": "2025-09-04 04:08:28.680318", "step": 2574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:08:28.788512", "step": 2574, "epoch": 2 }, { "type": "loss", "content": 0.002232232363894582, "timestamp": "2025-09-04 04:08:28.809195", "step": 2575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:28.901049", "step": 2575, "epoch": 2 }, { "type": "loss", "content": 0.015773437917232513, "timestamp": "2025-09-04 04:08:28.919005", "step": 2576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:29.009843", "step": 2576, "epoch": 2 }, { "type": "loss", "content": 0.01339112501591444, "timestamp": "2025-09-04 04:08:29.028845", "step": 2577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:29.130668", "step": 2577, "epoch": 2 }, { "type": "loss", "content": 0.016942497342824936, "timestamp": "2025-09-04 04:08:29.149712", "step": 2578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:08:29.223347", "step": 2578, "epoch": 2 }, { "type": "loss", "content": 0.024486076086759567, "timestamp": "2025-09-04 04:08:29.237004", "step": 2579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:08:29.320887", "step": 2579, "epoch": 2 }, { "type": "loss", "content": 0.03973688185214996, "timestamp": "2025-09-04 04:08:29.337239", "step": 2580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:37.715930", "step": 2580, "epoch": 2 }, { "type": "pplx", "content": 317.32499137431523, "timestamp": "2025-09-04 04:08:37.718369", "step": 2580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:37.815580", "step": 2580, "epoch": 2 }, { "type": "loss", "content": 0.004058394581079483, "timestamp": "2025-09-04 04:08:37.836637", "step": 2581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:08:37.923125", "step": 2581, "epoch": 2 }, { "type": "loss", "content": 0.006692761089652777, "timestamp": "2025-09-04 04:08:37.938509", "step": 2582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:38.034616", "step": 2582, "epoch": 2 }, { "type": "loss", "content": 0.01003364846110344, "timestamp": "2025-09-04 04:08:38.052107", "step": 2583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:38.147127", "step": 2583, "epoch": 2 }, { "type": "loss", "content": 0.0015831181081011891, "timestamp": "2025-09-04 04:08:38.165025", "step": 2584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:38.257632", "step": 2584, "epoch": 2 }, { "type": "loss", "content": 0.02085372433066368, "timestamp": "2025-09-04 04:08:38.276733", "step": 2585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:38.379441", "step": 2585, "epoch": 2 }, { "type": "loss", "content": 0.012194113805890083, "timestamp": "2025-09-04 04:08:38.398062", "step": 2586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:38.497361", "step": 2586, "epoch": 2 }, { "type": "loss", "content": 0.020145049318671227, "timestamp": "2025-09-04 04:08:38.515933", "step": 2587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:38.611335", "step": 2587, "epoch": 2 }, { "type": "loss", "content": 0.01815951056778431, "timestamp": "2025-09-04 04:08:38.629608", "step": 2588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:08:38.737350", "step": 2588, "epoch": 2 }, { "type": "loss", "content": 0.0061141857877373695, "timestamp": "2025-09-04 04:08:38.760046", "step": 2589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:08:38.843486", "step": 2589, "epoch": 2 }, { "type": "loss", "content": 0.04689551517367363, "timestamp": "2025-09-04 04:08:38.858515", "step": 2590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:38.962843", "step": 2590, "epoch": 2 }, { "type": "loss", "content": 0.005039518233388662, "timestamp": "2025-09-04 04:08:38.982095", "step": 2591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:39.083944", "step": 2591, "epoch": 2 }, { "type": "loss", "content": 0.003636001143604517, "timestamp": "2025-09-04 04:08:39.103553", "step": 2592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:39.194471", "step": 2592, "epoch": 2 }, { "type": "loss", "content": 0.0033800839446485043, "timestamp": "2025-09-04 04:08:39.213259", "step": 2593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:08:39.323734", "step": 2593, "epoch": 2 }, { "type": "loss", "content": 0.01337014976888895, "timestamp": "2025-09-04 04:08:39.344344", "step": 2594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:39.445586", "step": 2594, "epoch": 2 }, { "type": "loss", "content": 0.003958144225180149, "timestamp": "2025-09-04 04:08:39.464458", "step": 2595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 04:08:39.598304", "step": 2595, "epoch": 2 }, { "type": "loss", "content": 0.004944264888763428, "timestamp": "2025-09-04 04:08:39.623718", "step": 2596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:39.715344", "step": 2596, "epoch": 2 }, { "type": "loss", "content": 0.012598078697919846, "timestamp": "2025-09-04 04:08:39.734053", "step": 2597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:08:39.811419", "step": 2597, "epoch": 2 }, { "type": "loss", "content": 0.010607503354549408, "timestamp": "2025-09-04 04:08:39.825419", "step": 2598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:08:39.918935", "step": 2598, "epoch": 2 }, { "type": "loss", "content": 0.05367982015013695, "timestamp": "2025-09-04 04:08:39.936039", "step": 2599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:08:40.045928", "step": 2599, "epoch": 2 }, { "type": "loss", "content": 0.0185086727142334, "timestamp": "2025-09-04 04:08:40.067205", "step": 2600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:48.439146", "step": 2600, "epoch": 2 }, { "type": "pplx", "content": 314.9293203900891, "timestamp": "2025-09-04 04:08:48.441562", "step": 2600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2600", "timestamp": "2025-09-04 04:08:48.943661", "step": 2600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:08:49.017512", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.002318183658644557, "timestamp": "2025-09-04 04:08:49.032219", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:49.134643", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.02105123922228813, "timestamp": "2025-09-04 04:08:49.153886", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:08:49.255669", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.02376515232026577, "timestamp": "2025-09-04 04:08:49.274554", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:08:49.378640", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.027340862900018692, "timestamp": "2025-09-04 04:08:49.398723", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:49.490403", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.007925122044980526, "timestamp": "2025-09-04 04:08:49.509557", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:49.603779", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.006013544742017984, "timestamp": "2025-09-04 04:08:49.621155", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:49.733300", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.011563356965780258, "timestamp": "2025-09-04 04:08:49.753611", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:08:49.856580", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.006092959549278021, "timestamp": "2025-09-04 04:08:49.876556", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:08:49.980849", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.01699855737388134, "timestamp": "2025-09-04 04:08:50.002771", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:08:50.097356", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.002047803020104766, "timestamp": "2025-09-04 04:08:50.114758", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:08:50.187652", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.028143590316176414, "timestamp": "2025-09-04 04:08:50.200565", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:50.295727", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.0071517350152134895, "timestamp": "2025-09-04 04:08:50.313984", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:08:50.410801", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.01916099339723587, "timestamp": "2025-09-04 04:08:50.431216", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:08:50.518259", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.006362794432789087, "timestamp": "2025-09-04 04:08:50.533877", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:50.643528", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.006359405815601349, "timestamp": "2025-09-04 04:08:50.663843", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:08:50.750539", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.0024103245232254267, "timestamp": "2025-09-04 04:08:50.766946", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:08:50.859755", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.007830057293176651, "timestamp": "2025-09-04 04:08:50.878988", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:08:50.956543", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.05507553741335869, "timestamp": "2025-09-04 04:08:50.970499", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:08:51.042830", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.007564071100205183, "timestamp": "2025-09-04 04:08:51.055769", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:08:51.165873", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 0.0021915908437222242, "timestamp": "2025-09-04 04:08:51.187298", "step": 2620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:08:59.590829", "step": 2620, "epoch": 3 }, { "type": "pplx", "content": 315.285162165603, "timestamp": "2025-09-04 04:08:59.592803", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:08:59.696649", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 0.0060343146324157715, "timestamp": "2025-09-04 04:08:59.718904", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1392 ], "flops": 27840169073088.0 }, "timestamp": "2025-09-04 04:08:59.923521", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.0053197117522358894, "timestamp": "2025-09-04 04:08:59.963010", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:09:00.073876", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.020664365962147713, "timestamp": "2025-09-04 04:09:00.094500", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 04:09:00.309867", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 0.015484297648072243, "timestamp": "2025-09-04 04:09:00.351553", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:00.452502", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.002410691697150469, "timestamp": "2025-09-04 04:09:00.473673", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:00.579481", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.019456349313259125, "timestamp": "2025-09-04 04:09:00.599522", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:00.690498", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.008297095075249672, "timestamp": "2025-09-04 04:09:00.707257", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:00.807017", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.003302738768979907, "timestamp": "2025-09-04 04:09:00.826356", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:00.932212", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.11121071875095367, "timestamp": "2025-09-04 04:09:00.954440", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:09:01.068831", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.02678021229803562, "timestamp": "2025-09-04 04:09:01.089309", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:01.195192", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.00880197249352932, "timestamp": "2025-09-04 04:09:01.214228", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:01.308951", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.011775809340178967, "timestamp": "2025-09-04 04:09:01.326840", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:09:01.400378", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.010084496811032295, "timestamp": "2025-09-04 04:09:01.415169", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:01.504764", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.001346323057077825, "timestamp": "2025-09-04 04:09:01.521617", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:01.623955", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.016563931480050087, "timestamp": "2025-09-04 04:09:01.643140", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:01.739409", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.002511340891942382, "timestamp": "2025-09-04 04:09:01.757733", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:01.863630", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.1092916801571846, "timestamp": "2025-09-04 04:09:01.886104", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:01.995722", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.016762439161539078, "timestamp": "2025-09-04 04:09:02.016030", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:09:02.099780", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.0016962930094450712, "timestamp": "2025-09-04 04:09:02.115114", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:09:02.191928", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.01071829255670309, "timestamp": "2025-09-04 04:09:02.206841", "step": 2640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:09:10.584308", "step": 2640, "epoch": 3 }, { "type": "pplx", "content": 312.75249298706785, "timestamp": "2025-09-04 04:09:10.586294", "step": 2640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2640", "timestamp": "2025-09-04 04:09:10.941242", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:09:11.023411", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.014106813818216324, "timestamp": "2025-09-04 04:09:11.039997", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:09:11.139999", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.005073550622910261, "timestamp": "2025-09-04 04:09:11.158856", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:11.265331", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.013524402864277363, "timestamp": "2025-09-04 04:09:11.285345", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:11.368162", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.009907763451337814, "timestamp": "2025-09-04 04:09:11.384138", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:09:11.457840", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.002835202729329467, "timestamp": "2025-09-04 04:09:11.472809", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:11.581086", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.0016281316056847572, "timestamp": "2025-09-04 04:09:11.601399", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:11.704336", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.0024171601980924606, "timestamp": "2025-09-04 04:09:11.723618", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:11.819889", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.002636190503835678, "timestamp": "2025-09-04 04:09:11.837434", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:11.934303", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.00617353105917573, "timestamp": "2025-09-04 04:09:11.954720", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:12.047604", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.0027180935721844435, "timestamp": "2025-09-04 04:09:12.064695", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:12.166834", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.004888160619884729, "timestamp": "2025-09-04 04:09:12.186041", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:12.269214", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.00197249511256814, "timestamp": "2025-09-04 04:09:12.285048", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:12.388267", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.0006813482032157481, "timestamp": "2025-09-04 04:09:12.410251", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:12.505553", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.0030994487460702658, "timestamp": "2025-09-04 04:09:12.523054", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:12.612906", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.002902757376432419, "timestamp": "2025-09-04 04:09:12.629822", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:12.707648", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.0018415531376376748, "timestamp": "2025-09-04 04:09:12.722595", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:09:12.820427", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.015612171031534672, "timestamp": "2025-09-04 04:09:12.841224", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:12.941218", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.042000770568847656, "timestamp": "2025-09-04 04:09:12.959702", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:13.054817", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.0024185827933251858, "timestamp": "2025-09-04 04:09:13.072416", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:13.178902", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.02803855948150158, "timestamp": "2025-09-04 04:09:13.199805", "step": 2660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:09:21.711234", "step": 2660, "epoch": 3 }, { "type": "pplx", "content": 310.74890931755016, "timestamp": "2025-09-04 04:09:21.713177", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:21.805471", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.0010291390353813767, "timestamp": "2025-09-04 04:09:21.824559", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:09:21.934579", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.005789092276245356, "timestamp": "2025-09-04 04:09:21.955070", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:22.046012", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.015148711390793324, "timestamp": "2025-09-04 04:09:22.062794", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 04:09:22.282186", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.004448336083441973, "timestamp": "2025-09-04 04:09:22.325169", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:09:22.417654", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.0008569728815928102, "timestamp": "2025-09-04 04:09:22.436738", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:22.545585", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.034380171447992325, "timestamp": "2025-09-04 04:09:22.565817", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 04:09:22.695282", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.020487092435359955, "timestamp": "2025-09-04 04:09:22.718863", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:22.824617", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.004988936707377434, "timestamp": "2025-09-04 04:09:22.845347", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:09:22.928065", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.0038620613049715757, "timestamp": "2025-09-04 04:09:22.944722", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:23.048743", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.018743831664323807, "timestamp": "2025-09-04 04:09:23.068031", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:23.146345", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.044233791530132294, "timestamp": "2025-09-04 04:09:23.160449", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:23.268352", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.004456370137631893, "timestamp": "2025-09-04 04:09:23.289464", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:23.386870", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.08841562271118164, "timestamp": "2025-09-04 04:09:23.407241", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:09:23.518418", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.005260918755084276, "timestamp": "2025-09-04 04:09:23.539009", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:23.645223", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.009504856541752815, "timestamp": "2025-09-04 04:09:23.665320", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:09:23.749594", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.0136948861181736, "timestamp": "2025-09-04 04:09:23.765576", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:23.847090", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.00021050203940831125, "timestamp": "2025-09-04 04:09:23.863709", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:23.963423", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.007187894079834223, "timestamp": "2025-09-04 04:09:23.981938", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:09:24.093209", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.02946031652390957, "timestamp": "2025-09-04 04:09:24.113901", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:24.220814", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.004950105212628841, "timestamp": "2025-09-04 04:09:24.241589", "step": 2680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:09:32.746359", "step": 2680, "epoch": 3 }, { "type": "pplx", "content": 310.73999203780255, "timestamp": "2025-09-04 04:09:32.748816", "step": 2680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2680", "timestamp": "2025-09-04 04:09:33.104178", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:33.210287", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.02347641810774803, "timestamp": "2025-09-04 04:09:33.232593", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:33.327629", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.019025633111596107, "timestamp": "2025-09-04 04:09:33.345199", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:33.447778", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.0245287474244833, "timestamp": "2025-09-04 04:09:33.467127", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:33.575344", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.04559353366494179, "timestamp": "2025-09-04 04:09:33.596103", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:33.689138", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 0.08233796805143356, "timestamp": "2025-09-04 04:09:33.708433", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:09:33.795837", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.0036527463234961033, "timestamp": "2025-09-04 04:09:33.811533", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:33.915424", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.016142617911100388, "timestamp": "2025-09-04 04:09:33.934707", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:34.014124", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.029892858117818832, "timestamp": "2025-09-04 04:09:34.029190", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:09:34.099577", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 0.009370243176817894, "timestamp": "2025-09-04 04:09:34.113742", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:34.192195", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.017486093565821648, "timestamp": "2025-09-04 04:09:34.206344", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:34.309269", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.02338462322950363, "timestamp": "2025-09-04 04:09:34.328488", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:09:34.417191", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.048346079885959625, "timestamp": "2025-09-04 04:09:34.433483", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:34.524648", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.017204314470291138, "timestamp": "2025-09-04 04:09:34.543518", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:09:34.666403", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.00045050657354295254, "timestamp": "2025-09-04 04:09:34.689538", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:34.774209", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.01556316763162613, "timestamp": "2025-09-04 04:09:34.789446", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:34.888274", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.04271606728434563, "timestamp": "2025-09-04 04:09:34.907744", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:09:35.006125", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.008685296401381493, "timestamp": "2025-09-04 04:09:35.026917", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:09:35.133281", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.0019303993321955204, "timestamp": "2025-09-04 04:09:35.153384", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:35.252519", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.01800696924328804, "timestamp": "2025-09-04 04:09:35.271186", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:35.365532", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.01136024296283722, "timestamp": "2025-09-04 04:09:35.383767", "step": 2700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:09:43.928592", "step": 2700, "epoch": 3 }, { "type": "pplx", "content": 313.122110339322, "timestamp": "2025-09-04 04:09:43.930940", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 04:09:44.064080", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.011454450897872448, "timestamp": "2025-09-04 04:09:44.092892", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:44.185228", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 0.031085196882486343, "timestamp": "2025-09-04 04:09:44.201909", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:09:44.304243", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.006320777349174023, "timestamp": "2025-09-04 04:09:44.322922", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:44.427439", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.012303713709115982, "timestamp": "2025-09-04 04:09:44.447172", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:09:44.531985", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.004898655693978071, "timestamp": "2025-09-04 04:09:44.548628", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:09:44.662833", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.01601606048643589, "timestamp": "2025-09-04 04:09:44.681951", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:09:44.767300", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.011142881587147713, "timestamp": "2025-09-04 04:09:44.782310", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:09:44.901063", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.01005838718265295, "timestamp": "2025-09-04 04:09:44.923827", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:45.024411", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.05936342850327492, "timestamp": "2025-09-04 04:09:45.044274", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:45.156582", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.008189283311367035, "timestamp": "2025-09-04 04:09:45.173064", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:09:45.285504", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 0.10139759629964828, "timestamp": "2025-09-04 04:09:45.305622", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:45.407014", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 0.023838041350245476, "timestamp": "2025-09-04 04:09:45.426287", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:45.519681", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.02069295570254326, "timestamp": "2025-09-04 04:09:45.538280", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:09:45.616632", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.07725197076797485, "timestamp": "2025-09-04 04:09:45.630199", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:45.725539", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.006542861927300692, "timestamp": "2025-09-04 04:09:45.742578", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:45.847195", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.03040933422744274, "timestamp": "2025-09-04 04:09:45.867057", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:09:45.961569", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.0011650609085336328, "timestamp": "2025-09-04 04:09:45.980564", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:46.091204", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 0.020016556605696678, "timestamp": "2025-09-04 04:09:46.111056", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:46.213969", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.0162852443754673, "timestamp": "2025-09-04 04:09:46.231413", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:09:46.352029", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.0035490740556269884, "timestamp": "2025-09-04 04:09:46.374802", "step": 2720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:09:54.902522", "step": 2720, "epoch": 3 }, { "type": "pplx", "content": 314.5356204228338, "timestamp": "2025-09-04 04:09:54.905530", "step": 2720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2720", "timestamp": "2025-09-04 04:09:55.409233", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 04:09:55.588923", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.0011699828319251537, "timestamp": "2025-09-04 04:09:55.626999", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:55.710243", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 0.008469424210488796, "timestamp": "2025-09-04 04:09:55.725330", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:09:55.843055", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.005703017581254244, "timestamp": "2025-09-04 04:09:55.865254", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:09:55.940919", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.06145107373595238, "timestamp": "2025-09-04 04:09:55.955263", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:56.031260", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.021033862605690956, "timestamp": "2025-09-04 04:09:56.046734", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:09:56.138352", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.009126712568104267, "timestamp": "2025-09-04 04:09:56.155147", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:09:56.272076", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.009277377277612686, "timestamp": "2025-09-04 04:09:56.292646", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:09:56.386013", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.0015412485226988792, "timestamp": "2025-09-04 04:09:56.404221", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:09:56.499794", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.02090812474489212, "timestamp": "2025-09-04 04:09:56.520175", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:09:56.605490", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.010399392805993557, "timestamp": "2025-09-04 04:09:56.620588", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:09:56.730598", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.0056745195761322975, "timestamp": "2025-09-04 04:09:56.750912", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:09:56.844525", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.13878655433654785, "timestamp": "2025-09-04 04:09:56.862397", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:09:56.938712", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.016388939693570137, "timestamp": "2025-09-04 04:09:56.954158", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:09:57.053909", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.011486927047371864, "timestamp": "2025-09-04 04:09:57.072709", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:09:57.189227", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.007860065437853336, "timestamp": "2025-09-04 04:09:57.211366", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:09:57.322583", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.013591518625617027, "timestamp": "2025-09-04 04:09:57.343880", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:57.445097", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.002234839601442218, "timestamp": "2025-09-04 04:09:57.466151", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:57.568464", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.05005452781915665, "timestamp": "2025-09-04 04:09:57.585998", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:09:57.689780", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.02009434998035431, "timestamp": "2025-09-04 04:09:57.708905", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:09:57.804873", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.021923096850514412, "timestamp": "2025-09-04 04:09:57.823090", "step": 2740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:10:06.199426", "step": 2740, "epoch": 3 }, { "type": "pplx", "content": 311.12367018379337, "timestamp": "2025-09-04 04:10:06.201852", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:06.299389", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.017065318301320076, "timestamp": "2025-09-04 04:10:06.320394", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:06.429479", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.02739621140062809, "timestamp": "2025-09-04 04:10:06.449742", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:06.557790", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.0004024782683700323, "timestamp": "2025-09-04 04:10:06.578034", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:06.684752", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.03272821381688118, "timestamp": "2025-09-04 04:10:06.705438", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 04:10:06.833286", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.036441512405872345, "timestamp": "2025-09-04 04:10:06.860279", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:06.961146", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.017346149310469627, "timestamp": "2025-09-04 04:10:06.979983", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:07.086318", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.008722832426428795, "timestamp": "2025-09-04 04:10:07.106265", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:07.216471", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.004638466984033585, "timestamp": "2025-09-04 04:10:07.237228", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:10:07.320538", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.0057872445322573185, "timestamp": "2025-09-04 04:10:07.337391", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:10:07.448267", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.0026939285453408957, "timestamp": "2025-09-04 04:10:07.468552", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:07.570874", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.0036715560127049685, "timestamp": "2025-09-04 04:10:07.589996", "step": 2751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:10:07.671514", "step": 2751, "epoch": 3 }, { "type": "loss", "content": 0.0037178792990744114, "timestamp": "2025-09-04 04:10:07.686499", "step": 2752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:10:07.777656", "step": 2752, "epoch": 3 }, { "type": "loss", "content": 0.0027517317794263363, "timestamp": "2025-09-04 04:10:07.796789", "step": 2753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:07.897199", "step": 2753, "epoch": 3 }, { "type": "loss", "content": 0.0021229060366749763, "timestamp": "2025-09-04 04:10:07.916168", "step": 2754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:10:07.994227", "step": 2754, "epoch": 3 }, { "type": "loss", "content": 0.006144124083220959, "timestamp": "2025-09-04 04:10:08.008433", "step": 2755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:08.109566", "step": 2755, "epoch": 3 }, { "type": "loss", "content": 0.0041644214652478695, "timestamp": "2025-09-04 04:10:08.129094", "step": 2756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:10:08.239640", "step": 2756, "epoch": 3 }, { "type": "loss", "content": 0.018796509131789207, "timestamp": "2025-09-04 04:10:08.262060", "step": 2757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:10:08.379321", "step": 2757, "epoch": 3 }, { "type": "loss", "content": 0.009927745908498764, "timestamp": "2025-09-04 04:10:08.401456", "step": 2758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:10:08.511886", "step": 2758, "epoch": 3 }, { "type": "loss", "content": 0.0052697621285915375, "timestamp": "2025-09-04 04:10:08.532136", "step": 2759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:08.642566", "step": 2759, "epoch": 3 }, { "type": "loss", "content": 0.005806296598166227, "timestamp": "2025-09-04 04:10:08.663642", "step": 2760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:10:17.026885", "step": 2760, "epoch": 3 }, { "type": "pplx", "content": 312.6903928437137, "timestamp": "2025-09-04 04:10:17.028993", "step": 2760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2760", "timestamp": "2025-09-04 04:10:17.510313", "step": 2760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:10:17.606536", "step": 2760, "epoch": 3 }, { "type": "loss", "content": 0.03818826749920845, "timestamp": "2025-09-04 04:10:17.626870", "step": 2761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:10:17.708673", "step": 2761, "epoch": 3 }, { "type": "loss", "content": 0.004727974068373442, "timestamp": "2025-09-04 04:10:17.723698", "step": 2762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:17.824531", "step": 2762, "epoch": 3 }, { "type": "loss", "content": 0.001716333907097578, "timestamp": "2025-09-04 04:10:17.843370", "step": 2763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:10:17.929149", "step": 2763, "epoch": 3 }, { "type": "loss", "content": 0.01537768542766571, "timestamp": "2025-09-04 04:10:17.945052", "step": 2764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:18.039295", "step": 2764, "epoch": 3 }, { "type": "loss", "content": 0.003431823570281267, "timestamp": "2025-09-04 04:10:18.058557", "step": 2765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:10:18.152151", "step": 2765, "epoch": 3 }, { "type": "loss", "content": 0.01357530988752842, "timestamp": "2025-09-04 04:10:18.168950", "step": 2766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:10:18.263993", "step": 2766, "epoch": 3 }, { "type": "loss", "content": 0.009554415941238403, "timestamp": "2025-09-04 04:10:18.281261", "step": 2767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:18.386404", "step": 2767, "epoch": 3 }, { "type": "loss", "content": 0.04511303827166557, "timestamp": "2025-09-04 04:10:18.406572", "step": 2768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:18.510539", "step": 2768, "epoch": 3 }, { "type": "loss", "content": 0.00565410265699029, "timestamp": "2025-09-04 04:10:18.532370", "step": 2769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:18.627990", "step": 2769, "epoch": 3 }, { "type": "loss", "content": 0.01116864662617445, "timestamp": "2025-09-04 04:10:18.645617", "step": 2770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:10:18.717805", "step": 2770, "epoch": 3 }, { "type": "loss", "content": 0.024722347036004066, "timestamp": "2025-09-04 04:10:18.730823", "step": 2771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:18.831877", "step": 2771, "epoch": 3 }, { "type": "loss", "content": 0.028597375378012657, "timestamp": "2025-09-04 04:10:18.851318", "step": 2772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:18.929289", "step": 2772, "epoch": 3 }, { "type": "loss", "content": 0.0017174814129248261, "timestamp": "2025-09-04 04:10:18.944338", "step": 2773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:19.054046", "step": 2773, "epoch": 3 }, { "type": "loss", "content": 0.018303746357560158, "timestamp": "2025-09-04 04:10:19.074366", "step": 2774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:19.182654", "step": 2774, "epoch": 3 }, { "type": "loss", "content": 0.005763411987572908, "timestamp": "2025-09-04 04:10:19.202145", "step": 2775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:10:19.290334", "step": 2775, "epoch": 3 }, { "type": "loss", "content": 0.08446403592824936, "timestamp": "2025-09-04 04:10:19.306205", "step": 2776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:19.405065", "step": 2776, "epoch": 3 }, { "type": "loss", "content": 0.02213035710155964, "timestamp": "2025-09-04 04:10:19.425185", "step": 2777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:19.528569", "step": 2777, "epoch": 3 }, { "type": "loss", "content": 0.00788130797445774, "timestamp": "2025-09-04 04:10:19.547055", "step": 2778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:10:19.630529", "step": 2778, "epoch": 3 }, { "type": "loss", "content": 0.002031184732913971, "timestamp": "2025-09-04 04:10:19.645671", "step": 2779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:19.755409", "step": 2779, "epoch": 3 }, { "type": "loss", "content": 0.0032860611099749804, "timestamp": "2025-09-04 04:10:19.776604", "step": 2780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:10:28.279651", "step": 2780, "epoch": 3 }, { "type": "pplx", "content": 315.0957572527015, "timestamp": "2025-09-04 04:10:28.282924", "step": 2780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:28.358024", "step": 2780, "epoch": 3 }, { "type": "loss", "content": 0.020035451278090477, "timestamp": "2025-09-04 04:10:28.373018", "step": 2781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:28.481511", "step": 2781, "epoch": 3 }, { "type": "loss", "content": 0.019282890483736992, "timestamp": "2025-09-04 04:10:28.501835", "step": 2782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:28.605645", "step": 2782, "epoch": 3 }, { "type": "loss", "content": 0.0053369090892374516, "timestamp": "2025-09-04 04:10:28.624855", "step": 2783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:28.702196", "step": 2783, "epoch": 3 }, { "type": "loss", "content": 0.003618256188929081, "timestamp": "2025-09-04 04:10:28.717006", "step": 2784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:10:28.833789", "step": 2784, "epoch": 3 }, { "type": "loss", "content": 0.03532740846276283, "timestamp": "2025-09-04 04:10:28.857626", "step": 2785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:10:28.957073", "step": 2785, "epoch": 3 }, { "type": "loss", "content": 0.014763697050511837, "timestamp": "2025-09-04 04:10:28.975683", "step": 2786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:29.082579", "step": 2786, "epoch": 3 }, { "type": "loss", "content": 0.05285456404089928, "timestamp": "2025-09-04 04:10:29.102524", "step": 2787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:29.209574", "step": 2787, "epoch": 3 }, { "type": "loss", "content": 0.01331684272736311, "timestamp": "2025-09-04 04:10:29.230075", "step": 2788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:29.306219", "step": 2788, "epoch": 3 }, { "type": "loss", "content": 0.03862687200307846, "timestamp": "2025-09-04 04:10:29.321566", "step": 2789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:10:29.405360", "step": 2789, "epoch": 3 }, { "type": "loss", "content": 0.0027003700379282236, "timestamp": "2025-09-04 04:10:29.420505", "step": 2790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:10:29.513961", "step": 2790, "epoch": 3 }, { "type": "loss", "content": 0.024312736466526985, "timestamp": "2025-09-04 04:10:29.531382", "step": 2791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:10:29.641310", "step": 2791, "epoch": 3 }, { "type": "loss", "content": 0.009408136829733849, "timestamp": "2025-09-04 04:10:29.662400", "step": 2792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:10:29.770535", "step": 2792, "epoch": 3 }, { "type": "loss", "content": 0.008958464488387108, "timestamp": "2025-09-04 04:10:29.793068", "step": 2793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:29.900709", "step": 2793, "epoch": 3 }, { "type": "loss", "content": 0.0011589645873755217, "timestamp": "2025-09-04 04:10:29.920749", "step": 2794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:30.016347", "step": 2794, "epoch": 3 }, { "type": "loss", "content": 0.00341598829254508, "timestamp": "2025-09-04 04:10:30.033834", "step": 2795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:30.111178", "step": 2795, "epoch": 3 }, { "type": "loss", "content": 0.012302583083510399, "timestamp": "2025-09-04 04:10:30.125938", "step": 2796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:30.224337", "step": 2796, "epoch": 3 }, { "type": "loss", "content": 0.017612578347325325, "timestamp": "2025-09-04 04:10:30.245067", "step": 2797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:30.339629", "step": 2797, "epoch": 3 }, { "type": "loss", "content": 0.007235830184072256, "timestamp": "2025-09-04 04:10:30.357101", "step": 2798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:10:30.450764", "step": 2798, "epoch": 3 }, { "type": "loss", "content": 0.0061172679997980595, "timestamp": "2025-09-04 04:10:30.468141", "step": 2799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:10:30.579493", "step": 2799, "epoch": 3 }, { "type": "loss", "content": 0.001860892865806818, "timestamp": "2025-09-04 04:10:30.600698", "step": 2800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:10:39.011398", "step": 2800, "epoch": 3 }, { "type": "pplx", "content": 318.34111266257355, "timestamp": "2025-09-04 04:10:39.013794", "step": 2800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2800", "timestamp": "2025-09-04 04:10:39.393827", "step": 2800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:10:39.474968", "step": 2800, "epoch": 3 }, { "type": "loss", "content": 0.10843028128147125, "timestamp": "2025-09-04 04:10:39.491863", "step": 2801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:39.595028", "step": 2801, "epoch": 3 }, { "type": "loss", "content": 0.002566551323980093, "timestamp": "2025-09-04 04:10:39.614327", "step": 2802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:10:39.708227", "step": 2802, "epoch": 3 }, { "type": "loss", "content": 0.010621008463203907, "timestamp": "2025-09-04 04:10:39.725450", "step": 2803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:10:39.803350", "step": 2803, "epoch": 3 }, { "type": "loss", "content": 0.015502018854022026, "timestamp": "2025-09-04 04:10:39.818407", "step": 2804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:39.918618", "step": 2804, "epoch": 3 }, { "type": "loss", "content": 0.00420707743614912, "timestamp": "2025-09-04 04:10:39.939782", "step": 2805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:40.032227", "step": 2805, "epoch": 3 }, { "type": "loss", "content": 0.02263396605849266, "timestamp": "2025-09-04 04:10:40.046253", "step": 2806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:10:40.124234", "step": 2806, "epoch": 3 }, { "type": "loss", "content": 0.0052959551103413105, "timestamp": "2025-09-04 04:10:40.138380", "step": 2807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:40.242077", "step": 2807, "epoch": 3 }, { "type": "loss", "content": 0.015647169202566147, "timestamp": "2025-09-04 04:10:40.262075", "step": 2808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:40.369856", "step": 2808, "epoch": 3 }, { "type": "loss", "content": 0.0023905981797724962, "timestamp": "2025-09-04 04:10:40.392110", "step": 2809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:40.500448", "step": 2809, "epoch": 3 }, { "type": "loss", "content": 0.002249652286991477, "timestamp": "2025-09-04 04:10:40.520444", "step": 2810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:40.624725", "step": 2810, "epoch": 3 }, { "type": "loss", "content": 0.006803617812693119, "timestamp": "2025-09-04 04:10:40.644151", "step": 2811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:40.748833", "step": 2811, "epoch": 3 }, { "type": "loss", "content": 0.0021052875090390444, "timestamp": "2025-09-04 04:10:40.768933", "step": 2812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:40.869181", "step": 2812, "epoch": 3 }, { "type": "loss", "content": 0.004011242184787989, "timestamp": "2025-09-04 04:10:40.890221", "step": 2813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 04:10:41.095710", "step": 2813, "epoch": 3 }, { "type": "loss", "content": 0.0032255747355520725, "timestamp": "2025-09-04 04:10:41.134881", "step": 2814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:41.212690", "step": 2814, "epoch": 3 }, { "type": "loss", "content": 0.0025714444927871227, "timestamp": "2025-09-04 04:10:41.226621", "step": 2815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:41.327667", "step": 2815, "epoch": 3 }, { "type": "loss", "content": 0.013107407838106155, "timestamp": "2025-09-04 04:10:41.347348", "step": 2816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:10:41.444093", "step": 2816, "epoch": 3 }, { "type": "loss", "content": 0.0025292744394391775, "timestamp": "2025-09-04 04:10:41.464580", "step": 2817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:41.566996", "step": 2817, "epoch": 3 }, { "type": "loss", "content": 0.019604351371526718, "timestamp": "2025-09-04 04:10:41.586193", "step": 2818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:10:41.685731", "step": 2818, "epoch": 3 }, { "type": "loss", "content": 0.006207880564033985, "timestamp": "2025-09-04 04:10:41.704494", "step": 2819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:10:41.804185", "step": 2819, "epoch": 3 }, { "type": "loss", "content": 0.0022927229292690754, "timestamp": "2025-09-04 04:10:41.823554", "step": 2820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:10:50.340911", "step": 2820, "epoch": 3 }, { "type": "pplx", "content": 319.52629439814444, "timestamp": "2025-09-04 04:10:50.346919", "step": 2820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:50.421114", "step": 2820, "epoch": 3 }, { "type": "loss", "content": 0.0014415581244975328, "timestamp": "2025-09-04 04:10:50.436450", "step": 2821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:50.513386", "step": 2821, "epoch": 3 }, { "type": "loss", "content": 0.039860036224126816, "timestamp": "2025-09-04 04:10:50.527474", "step": 2822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:10:50.638203", "step": 2822, "epoch": 3 }, { "type": "loss", "content": 0.04684140905737877, "timestamp": "2025-09-04 04:10:50.658808", "step": 2823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:10:50.751914", "step": 2823, "epoch": 3 }, { "type": "loss", "content": 0.005943602416664362, "timestamp": "2025-09-04 04:10:50.769856", "step": 2824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:50.868293", "step": 2824, "epoch": 3 }, { "type": "loss", "content": 0.006049733608961105, "timestamp": "2025-09-04 04:10:50.888798", "step": 2825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:50.988731", "step": 2825, "epoch": 3 }, { "type": "loss", "content": 0.0014789201086387038, "timestamp": "2025-09-04 04:10:51.007670", "step": 2826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 04:10:51.233431", "step": 2826, "epoch": 3 }, { "type": "loss", "content": 0.029814256355166435, "timestamp": "2025-09-04 04:10:51.275647", "step": 2827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:10:51.380010", "step": 2827, "epoch": 3 }, { "type": "loss", "content": 0.004776179324835539, "timestamp": "2025-09-04 04:10:51.400069", "step": 2828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:10:51.476052", "step": 2828, "epoch": 3 }, { "type": "loss", "content": 0.025096198543906212, "timestamp": "2025-09-04 04:10:51.491170", "step": 2829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1200 ], "flops": 24000145761984.0 }, "timestamp": "2025-09-04 04:10:51.666720", "step": 2829, "epoch": 3 }, { "type": "loss", "content": 0.0008539275149814785, "timestamp": "2025-09-04 04:10:51.699819", "step": 2830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:51.794605", "step": 2830, "epoch": 3 }, { "type": "loss", "content": 0.039075467735528946, "timestamp": "2025-09-04 04:10:51.812176", "step": 2831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:10:51.927654", "step": 2831, "epoch": 3 }, { "type": "loss", "content": 0.002711005974560976, "timestamp": "2025-09-04 04:10:51.948514", "step": 2832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:10:52.054122", "step": 2832, "epoch": 3 }, { "type": "loss", "content": 0.01790589839220047, "timestamp": "2025-09-04 04:10:52.076419", "step": 2833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:52.177333", "step": 2833, "epoch": 3 }, { "type": "loss", "content": 0.06275072693824768, "timestamp": "2025-09-04 04:10:52.196207", "step": 2834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:52.300946", "step": 2834, "epoch": 3 }, { "type": "loss", "content": 0.004898981656879187, "timestamp": "2025-09-04 04:10:52.320225", "step": 2835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:10:52.532531", "step": 2835, "epoch": 3 }, { "type": "loss", "content": 0.009175102226436138, "timestamp": "2025-09-04 04:10:52.572401", "step": 2836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:10:52.665860", "step": 2836, "epoch": 3 }, { "type": "loss", "content": 0.009001928381621838, "timestamp": "2025-09-04 04:10:52.685175", "step": 2837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:10:52.768630", "step": 2837, "epoch": 3 }, { "type": "loss", "content": 0.0046532354317605495, "timestamp": "2025-09-04 04:10:52.783757", "step": 2838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:10:52.887118", "step": 2838, "epoch": 3 }, { "type": "loss", "content": 0.010329126380383968, "timestamp": "2025-09-04 04:10:52.906475", "step": 2839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:10:53.005841", "step": 2839, "epoch": 3 }, { "type": "loss", "content": 0.010225264355540276, "timestamp": "2025-09-04 04:10:53.025440", "step": 2840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:01.520893", "step": 2840, "epoch": 3 }, { "type": "pplx", "content": 325.51343438767867, "timestamp": "2025-09-04 04:11:01.522859", "step": 2840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2840", "timestamp": "2025-09-04 04:11:01.898939", "step": 2840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:02.001221", "step": 2840, "epoch": 3 }, { "type": "loss", "content": 0.01796550862491131, "timestamp": "2025-09-04 04:11:02.022491", "step": 2841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:11:02.125380", "step": 2841, "epoch": 3 }, { "type": "loss", "content": 0.005674062762409449, "timestamp": "2025-09-04 04:11:02.144648", "step": 2842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:11:02.266925", "step": 2842, "epoch": 3 }, { "type": "loss", "content": 0.01859263703227043, "timestamp": "2025-09-04 04:11:02.290243", "step": 2843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:02.397530", "step": 2843, "epoch": 3 }, { "type": "loss", "content": 0.0015665870159864426, "timestamp": "2025-09-04 04:11:02.418251", "step": 2844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:11:02.527583", "step": 2844, "epoch": 3 }, { "type": "loss", "content": 0.012087870389223099, "timestamp": "2025-09-04 04:11:02.550254", "step": 2845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:02.659541", "step": 2845, "epoch": 3 }, { "type": "loss", "content": 0.020001424476504326, "timestamp": "2025-09-04 04:11:02.679920", "step": 2846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:11:02.756649", "step": 2846, "epoch": 3 }, { "type": "loss", "content": 0.028880124911665916, "timestamp": "2025-09-04 04:11:02.770460", "step": 2847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 04:11:02.907840", "step": 2847, "epoch": 3 }, { "type": "loss", "content": 0.05806123465299606, "timestamp": "2025-09-04 04:11:02.934843", "step": 2848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:11:03.012846", "step": 2848, "epoch": 3 }, { "type": "loss", "content": 0.024739542976021767, "timestamp": "2025-09-04 04:11:03.028248", "step": 2849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:03.132752", "step": 2849, "epoch": 3 }, { "type": "loss", "content": 0.01195996068418026, "timestamp": "2025-09-04 04:11:03.151984", "step": 2850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:03.252141", "step": 2850, "epoch": 3 }, { "type": "loss", "content": 0.002954959636554122, "timestamp": "2025-09-04 04:11:03.271134", "step": 2851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:03.381295", "step": 2851, "epoch": 3 }, { "type": "loss", "content": 0.03290526196360588, "timestamp": "2025-09-04 04:11:03.402381", "step": 2852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:03.503914", "step": 2852, "epoch": 3 }, { "type": "loss", "content": 0.0018533534603193402, "timestamp": "2025-09-04 04:11:03.525056", "step": 2853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:03.629515", "step": 2853, "epoch": 3 }, { "type": "loss", "content": 0.006066088564693928, "timestamp": "2025-09-04 04:11:03.648528", "step": 2854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 04:11:03.822330", "step": 2854, "epoch": 3 }, { "type": "loss", "content": 0.0006023372989147902, "timestamp": "2025-09-04 04:11:03.855018", "step": 2855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:11:03.941618", "step": 2855, "epoch": 3 }, { "type": "loss", "content": 0.012206432409584522, "timestamp": "2025-09-04 04:11:03.957991", "step": 2856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:04.049387", "step": 2856, "epoch": 3 }, { "type": "loss", "content": 0.011448818258941174, "timestamp": "2025-09-04 04:11:04.068480", "step": 2857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:04.161673", "step": 2857, "epoch": 3 }, { "type": "loss", "content": 0.008118141442537308, "timestamp": "2025-09-04 04:11:04.178779", "step": 2858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:04.271839", "step": 2858, "epoch": 3 }, { "type": "loss", "content": 0.008240415714681149, "timestamp": "2025-09-04 04:11:04.288945", "step": 2859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:04.389798", "step": 2859, "epoch": 3 }, { "type": "loss", "content": 0.0023680399172008038, "timestamp": "2025-09-04 04:11:04.409541", "step": 2860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:12.908013", "step": 2860, "epoch": 3 }, { "type": "pplx", "content": 331.32267155954526, "timestamp": "2025-09-04 04:11:12.910564", "step": 2860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:13.014712", "step": 2860, "epoch": 3 }, { "type": "loss", "content": 0.0046889204531908035, "timestamp": "2025-09-04 04:11:13.037029", "step": 2861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:13.136629", "step": 2861, "epoch": 3 }, { "type": "loss", "content": 0.016678936779499054, "timestamp": "2025-09-04 04:11:13.155187", "step": 2862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:13.255837", "step": 2862, "epoch": 3 }, { "type": "loss", "content": 0.0009306335123255849, "timestamp": "2025-09-04 04:11:13.274539", "step": 2863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:11:13.384828", "step": 2863, "epoch": 3 }, { "type": "loss", "content": 0.003980133216828108, "timestamp": "2025-09-04 04:11:13.406117", "step": 2864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:13.501527", "step": 2864, "epoch": 3 }, { "type": "loss", "content": 0.0005760484491474926, "timestamp": "2025-09-04 04:11:13.520434", "step": 2865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:11:13.631198", "step": 2865, "epoch": 3 }, { "type": "loss", "content": 0.0011027660220861435, "timestamp": "2025-09-04 04:11:13.651783", "step": 2866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:11:13.735521", "step": 2866, "epoch": 3 }, { "type": "loss", "content": 0.009353280998766422, "timestamp": "2025-09-04 04:11:13.750541", "step": 2867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:11:13.852996", "step": 2867, "epoch": 3 }, { "type": "loss", "content": 0.00644304882735014, "timestamp": "2025-09-04 04:11:13.873074", "step": 2868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:11:14.072568", "step": 2868, "epoch": 3 }, { "type": "loss", "content": 0.026496384292840958, "timestamp": "2025-09-04 04:11:14.115279", "step": 2869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:11:14.233382", "step": 2869, "epoch": 3 }, { "type": "loss", "content": 0.003224861342459917, "timestamp": "2025-09-04 04:11:14.255497", "step": 2870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:14.351529", "step": 2870, "epoch": 3 }, { "type": "loss", "content": 0.011617367155849934, "timestamp": "2025-09-04 04:11:14.368918", "step": 2871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:14.476955", "step": 2871, "epoch": 3 }, { "type": "loss", "content": 0.018545642495155334, "timestamp": "2025-09-04 04:11:14.498148", "step": 2872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:14.601865", "step": 2872, "epoch": 3 }, { "type": "loss", "content": 0.0030028163455426693, "timestamp": "2025-09-04 04:11:14.623707", "step": 2873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:11:14.759811", "step": 2873, "epoch": 3 }, { "type": "loss", "content": 0.0005331755965016782, "timestamp": "2025-09-04 04:11:14.785767", "step": 2874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:11:14.871281", "step": 2874, "epoch": 3 }, { "type": "loss", "content": 0.021123895421624184, "timestamp": "2025-09-04 04:11:14.886891", "step": 2875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:11:14.977528", "step": 2875, "epoch": 3 }, { "type": "loss", "content": 0.001793418894521892, "timestamp": "2025-09-04 04:11:14.995043", "step": 2876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:15.091817", "step": 2876, "epoch": 3 }, { "type": "loss", "content": 0.005147572606801987, "timestamp": "2025-09-04 04:11:15.112275", "step": 2877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:11:15.195551", "step": 2877, "epoch": 3 }, { "type": "loss", "content": 0.042930182069540024, "timestamp": "2025-09-04 04:11:15.210752", "step": 2878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:15.303265", "step": 2878, "epoch": 3 }, { "type": "loss", "content": 0.008315377868711948, "timestamp": "2025-09-04 04:11:15.320401", "step": 2879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:11:15.391797", "step": 2879, "epoch": 3 }, { "type": "loss", "content": 0.0415649451315403, "timestamp": "2025-09-04 04:11:15.405539", "step": 2880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:23.878254", "step": 2880, "epoch": 3 }, { "type": "pplx", "content": 334.28353166069746, "timestamp": "2025-09-04 04:11:23.880247", "step": 2880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2880", "timestamp": "2025-09-04 04:11:24.404237", "step": 2880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:11:24.478633", "step": 2880, "epoch": 3 }, { "type": "loss", "content": 0.015389709733426571, "timestamp": "2025-09-04 04:11:24.493617", "step": 2881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:11:24.570800", "step": 2881, "epoch": 3 }, { "type": "loss", "content": 0.012384703382849693, "timestamp": "2025-09-04 04:11:24.584952", "step": 2882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:24.680140", "step": 2882, "epoch": 3 }, { "type": "loss", "content": 0.0006399277481250465, "timestamp": "2025-09-04 04:11:24.697690", "step": 2883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:24.791237", "step": 2883, "epoch": 3 }, { "type": "loss", "content": 0.020068077370524406, "timestamp": "2025-09-04 04:11:24.809296", "step": 2884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:24.906372", "step": 2884, "epoch": 3 }, { "type": "loss", "content": 0.006495574954897165, "timestamp": "2025-09-04 04:11:24.926876", "step": 2885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:11:25.043941", "step": 2885, "epoch": 3 }, { "type": "loss", "content": 0.03137728571891785, "timestamp": "2025-09-04 04:11:25.066254", "step": 2886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:25.159725", "step": 2886, "epoch": 3 }, { "type": "loss", "content": 0.01227316539734602, "timestamp": "2025-09-04 04:11:25.177117", "step": 2887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 04:11:25.315117", "step": 2887, "epoch": 3 }, { "type": "loss", "content": 0.0028199530206620693, "timestamp": "2025-09-04 04:11:25.342043", "step": 2888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:25.432237", "step": 2888, "epoch": 3 }, { "type": "loss", "content": 0.009654730558395386, "timestamp": "2025-09-04 04:11:25.451127", "step": 2889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:25.547980", "step": 2889, "epoch": 3 }, { "type": "loss", "content": 0.02699156291782856, "timestamp": "2025-09-04 04:11:25.565553", "step": 2890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:25.666229", "step": 2890, "epoch": 3 }, { "type": "loss", "content": 0.014877088367938995, "timestamp": "2025-09-04 04:11:25.684974", "step": 2891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:11:25.764195", "step": 2891, "epoch": 3 }, { "type": "loss", "content": 0.041199635714292526, "timestamp": "2025-09-04 04:11:25.779178", "step": 2892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:25.877779", "step": 2892, "epoch": 3 }, { "type": "loss", "content": 0.04918312653899193, "timestamp": "2025-09-04 04:11:25.898518", "step": 2893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:11:25.969548", "step": 2893, "epoch": 3 }, { "type": "loss", "content": 0.04640977457165718, "timestamp": "2025-09-04 04:11:25.982360", "step": 2894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:26.091334", "step": 2894, "epoch": 3 }, { "type": "loss", "content": 0.011765974573791027, "timestamp": "2025-09-04 04:11:26.111741", "step": 2895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:26.211303", "step": 2895, "epoch": 3 }, { "type": "loss", "content": 0.004320243373513222, "timestamp": "2025-09-04 04:11:26.230665", "step": 2896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:11:26.313778", "step": 2896, "epoch": 3 }, { "type": "loss", "content": 0.003690729383379221, "timestamp": "2025-09-04 04:11:26.330762", "step": 2897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:26.427045", "step": 2897, "epoch": 3 }, { "type": "loss", "content": 0.005680757109075785, "timestamp": "2025-09-04 04:11:26.444663", "step": 2898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:26.545200", "step": 2898, "epoch": 3 }, { "type": "loss", "content": 0.0033540872391313314, "timestamp": "2025-09-04 04:11:26.564186", "step": 2899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:11:26.679234", "step": 2899, "epoch": 3 }, { "type": "loss", "content": 0.008547638542950153, "timestamp": "2025-09-04 04:11:26.700432", "step": 2900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:35.169803", "step": 2900, "epoch": 3 }, { "type": "pplx", "content": 331.12824914190674, "timestamp": "2025-09-04 04:11:35.172146", "step": 2900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:35.267283", "step": 2900, "epoch": 3 }, { "type": "loss", "content": 0.0010944758541882038, "timestamp": "2025-09-04 04:11:35.287673", "step": 2901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:11:35.363842", "step": 2901, "epoch": 3 }, { "type": "loss", "content": 0.016356654465198517, "timestamp": "2025-09-04 04:11:35.377690", "step": 2902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:35.471661", "step": 2902, "epoch": 3 }, { "type": "loss", "content": 0.0011579522397369146, "timestamp": "2025-09-04 04:11:35.489033", "step": 2903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:35.600157", "step": 2903, "epoch": 3 }, { "type": "loss", "content": 0.0319959782063961, "timestamp": "2025-09-04 04:11:35.620991", "step": 2904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:35.721187", "step": 2904, "epoch": 3 }, { "type": "loss", "content": 0.000950302230194211, "timestamp": "2025-09-04 04:11:35.742430", "step": 2905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:35.848848", "step": 2905, "epoch": 3 }, { "type": "loss", "content": 0.001381176058202982, "timestamp": "2025-09-04 04:11:35.868867", "step": 2906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:11:35.958637", "step": 2906, "epoch": 3 }, { "type": "loss", "content": 0.0023815217427909374, "timestamp": "2025-09-04 04:11:35.975448", "step": 2907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:36.069679", "step": 2907, "epoch": 3 }, { "type": "loss", "content": 0.01757667027413845, "timestamp": "2025-09-04 04:11:36.087672", "step": 2908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:36.188807", "step": 2908, "epoch": 3 }, { "type": "loss", "content": 0.0278801117092371, "timestamp": "2025-09-04 04:11:36.210007", "step": 2909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:11:36.285728", "step": 2909, "epoch": 3 }, { "type": "loss", "content": 0.01353493519127369, "timestamp": "2025-09-04 04:11:36.299222", "step": 2910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:11:36.401642", "step": 2910, "epoch": 3 }, { "type": "loss", "content": 0.04371942579746246, "timestamp": "2025-09-04 04:11:36.420836", "step": 2911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:36.521093", "step": 2911, "epoch": 3 }, { "type": "loss", "content": 0.0061318958178162575, "timestamp": "2025-09-04 04:11:36.540465", "step": 2912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:11:36.624627", "step": 2912, "epoch": 3 }, { "type": "loss", "content": 0.057585082948207855, "timestamp": "2025-09-04 04:11:36.641737", "step": 2913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:36.745445", "step": 2913, "epoch": 3 }, { "type": "loss", "content": 0.0021131334360688925, "timestamp": "2025-09-04 04:11:36.763965", "step": 2914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:11:36.848566", "step": 2914, "epoch": 3 }, { "type": "loss", "content": 0.01308556366711855, "timestamp": "2025-09-04 04:11:36.863954", "step": 2915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:11:36.987369", "step": 2915, "epoch": 3 }, { "type": "loss", "content": 0.0034837471321225166, "timestamp": "2025-09-04 04:11:37.011142", "step": 2916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:37.105198", "step": 2916, "epoch": 3 }, { "type": "loss", "content": 0.0005953749641776085, "timestamp": "2025-09-04 04:11:37.124172", "step": 2917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:37.229787", "step": 2917, "epoch": 3 }, { "type": "loss", "content": 0.005551936570554972, "timestamp": "2025-09-04 04:11:37.249736", "step": 2918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:37.356024", "step": 2918, "epoch": 3 }, { "type": "loss", "content": 0.008271034806966782, "timestamp": "2025-09-04 04:11:37.376070", "step": 2919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:11:37.510820", "step": 2919, "epoch": 3 }, { "type": "loss", "content": 0.01085783913731575, "timestamp": "2025-09-04 04:11:37.537414", "step": 2920, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:46.026493", "step": 2920, "epoch": 3 }, { "type": "pplx", "content": 321.86002245296936, "timestamp": "2025-09-04 04:11:46.028695", "step": 2920, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2920", "timestamp": "2025-09-04 04:11:46.382176", "step": 2920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:11:46.500520", "step": 2920, "epoch": 3 }, { "type": "loss", "content": 0.0059812976978719234, "timestamp": "2025-09-04 04:11:46.525777", "step": 2921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:46.631419", "step": 2921, "epoch": 3 }, { "type": "loss", "content": 0.0009687381098046899, "timestamp": "2025-09-04 04:11:46.648895", "step": 2922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:11:46.784826", "step": 2922, "epoch": 3 }, { "type": "loss", "content": 0.005854323972016573, "timestamp": "2025-09-04 04:11:46.810927", "step": 2923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:46.910290", "step": 2923, "epoch": 3 }, { "type": "loss", "content": 0.023817606270313263, "timestamp": "2025-09-04 04:11:46.929762", "step": 2924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:11:47.036718", "step": 2924, "epoch": 3 }, { "type": "loss", "content": 0.006292128004133701, "timestamp": "2025-09-04 04:11:47.059290", "step": 2925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:11:47.161720", "step": 2925, "epoch": 3 }, { "type": "loss", "content": 0.001953437924385071, "timestamp": "2025-09-04 04:11:47.181032", "step": 2926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:11:47.257446", "step": 2926, "epoch": 3 }, { "type": "loss", "content": 0.00025311694480478764, "timestamp": "2025-09-04 04:11:47.271151", "step": 2927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:47.364263", "step": 2927, "epoch": 3 }, { "type": "loss", "content": 0.01116481889039278, "timestamp": "2025-09-04 04:11:47.382181", "step": 2928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:11:47.474507", "step": 2928, "epoch": 3 }, { "type": "loss", "content": 0.016344843432307243, "timestamp": "2025-09-04 04:11:47.493520", "step": 2929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:11:47.587785", "step": 2929, "epoch": 3 }, { "type": "loss", "content": 0.0564613938331604, "timestamp": "2025-09-04 04:11:47.605044", "step": 2930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:47.707478", "step": 2930, "epoch": 3 }, { "type": "loss", "content": 0.006602128501981497, "timestamp": "2025-09-04 04:11:47.726448", "step": 2931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:11:47.804329", "step": 2931, "epoch": 3 }, { "type": "loss", "content": 0.00032434234162792563, "timestamp": "2025-09-04 04:11:47.819267", "step": 2932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:11:47.911829", "step": 2932, "epoch": 3 }, { "type": "loss", "content": 0.011728801764547825, "timestamp": "2025-09-04 04:11:47.931177", "step": 2933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:11:48.008311", "step": 2933, "epoch": 3 }, { "type": "loss", "content": 0.03661501035094261, "timestamp": "2025-09-04 04:11:48.022294", "step": 2934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:48.126565", "step": 2934, "epoch": 3 }, { "type": "loss", "content": 0.005506326910108328, "timestamp": "2025-09-04 04:11:48.145965", "step": 2935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:11:48.248985", "step": 2935, "epoch": 3 }, { "type": "loss", "content": 0.007380081806331873, "timestamp": "2025-09-04 04:11:48.268936", "step": 2936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:11:48.356618", "step": 2936, "epoch": 3 }, { "type": "loss", "content": 0.023367907851934433, "timestamp": "2025-09-04 04:11:48.375066", "step": 2937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:11:48.484782", "step": 2937, "epoch": 3 }, { "type": "loss", "content": 0.041043538600206375, "timestamp": "2025-09-04 04:11:48.505538", "step": 2938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:48.607041", "step": 2938, "epoch": 3 }, { "type": "loss", "content": 0.014860640279948711, "timestamp": "2025-09-04 04:11:48.625778", "step": 2939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:11:48.712747", "step": 2939, "epoch": 3 }, { "type": "loss", "content": 0.05614135414361954, "timestamp": "2025-09-04 04:11:48.729282", "step": 2940, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:11:57.219177", "step": 2940, "epoch": 3 }, { "type": "pplx", "content": 315.5265659498303, "timestamp": "2025-09-04 04:11:57.221252", "step": 2940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:11:57.296365", "step": 2940, "epoch": 3 }, { "type": "loss", "content": 0.004779836628586054, "timestamp": "2025-09-04 04:11:57.311775", "step": 2941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:11:57.386172", "step": 2941, "epoch": 3 }, { "type": "loss", "content": 0.024101873859763145, "timestamp": "2025-09-04 04:11:57.399796", "step": 2942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:57.506945", "step": 2942, "epoch": 3 }, { "type": "loss", "content": 0.011341488920152187, "timestamp": "2025-09-04 04:11:57.527061", "step": 2943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:11:57.637526", "step": 2943, "epoch": 3 }, { "type": "loss", "content": 0.022627878934144974, "timestamp": "2025-09-04 04:11:57.658821", "step": 2944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:57.756279", "step": 2944, "epoch": 3 }, { "type": "loss", "content": 0.01583726704120636, "timestamp": "2025-09-04 04:11:57.777101", "step": 2945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:11:57.912462", "step": 2945, "epoch": 3 }, { "type": "loss", "content": 0.011284503154456615, "timestamp": "2025-09-04 04:11:57.938532", "step": 2946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:58.041921", "step": 2946, "epoch": 3 }, { "type": "loss", "content": 0.01882072165608406, "timestamp": "2025-09-04 04:11:58.061301", "step": 2947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:11:58.145850", "step": 2947, "epoch": 3 }, { "type": "loss", "content": 0.01440991461277008, "timestamp": "2025-09-04 04:11:58.159707", "step": 2948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:11:58.256671", "step": 2948, "epoch": 3 }, { "type": "loss", "content": 0.026601549237966537, "timestamp": "2025-09-04 04:11:58.277139", "step": 2949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:11:58.380822", "step": 2949, "epoch": 3 }, { "type": "loss", "content": 0.03433069586753845, "timestamp": "2025-09-04 04:11:58.400182", "step": 2950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:11:58.476174", "step": 2950, "epoch": 3 }, { "type": "loss", "content": 0.0034103342331945896, "timestamp": "2025-09-04 04:11:58.489905", "step": 2951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:11:58.572803", "step": 2951, "epoch": 3 }, { "type": "loss", "content": 0.005983210634440184, "timestamp": "2025-09-04 04:11:58.588627", "step": 2952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:58.690559", "step": 2952, "epoch": 3 }, { "type": "loss", "content": 0.016223527491092682, "timestamp": "2025-09-04 04:11:58.711211", "step": 2953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:11:58.798428", "step": 2953, "epoch": 3 }, { "type": "loss", "content": 0.005183096043765545, "timestamp": "2025-09-04 04:11:58.814156", "step": 2954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:11:58.900792", "step": 2954, "epoch": 3 }, { "type": "loss", "content": 0.041471634060144424, "timestamp": "2025-09-04 04:11:58.916498", "step": 2955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:11:59.026318", "step": 2955, "epoch": 3 }, { "type": "loss", "content": 0.0007432362181134522, "timestamp": "2025-09-04 04:11:59.046046", "step": 2956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:11:59.165833", "step": 2956, "epoch": 3 }, { "type": "loss", "content": 0.010432606562972069, "timestamp": "2025-09-04 04:11:59.191418", "step": 2957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:11:59.270172", "step": 2957, "epoch": 3 }, { "type": "loss", "content": 0.015087028034031391, "timestamp": "2025-09-04 04:11:59.284449", "step": 2958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:11:59.392745", "step": 2958, "epoch": 3 }, { "type": "loss", "content": 0.0005746278329752386, "timestamp": "2025-09-04 04:11:59.412683", "step": 2959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 04:11:59.546824", "step": 2959, "epoch": 3 }, { "type": "loss", "content": 0.005405406001955271, "timestamp": "2025-09-04 04:11:59.571566", "step": 2960, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:12:08.056684", "step": 2960, "epoch": 3 }, { "type": "pplx", "content": 312.525467653901, "timestamp": "2025-09-04 04:12:08.058678", "step": 2960, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2960", "timestamp": "2025-09-04 04:12:08.564968", "step": 2960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:12:08.639868", "step": 2960, "epoch": 3 }, { "type": "loss", "content": 0.011480286717414856, "timestamp": "2025-09-04 04:12:08.654576", "step": 2961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:12:08.756540", "step": 2961, "epoch": 3 }, { "type": "loss", "content": 0.01124604418873787, "timestamp": "2025-09-04 04:12:08.775864", "step": 2962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:08.876374", "step": 2962, "epoch": 3 }, { "type": "loss", "content": 0.005138123407959938, "timestamp": "2025-09-04 04:12:08.895235", "step": 2963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:12:09.005049", "step": 2963, "epoch": 3 }, { "type": "loss", "content": 0.007659485097974539, "timestamp": "2025-09-04 04:12:09.026215", "step": 2964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:12:09.134391", "step": 2964, "epoch": 3 }, { "type": "loss", "content": 0.02365208975970745, "timestamp": "2025-09-04 04:12:09.157039", "step": 2965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:09.250194", "step": 2965, "epoch": 3 }, { "type": "loss", "content": 0.0039781988598406315, "timestamp": "2025-09-04 04:12:09.267441", "step": 2966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:12:09.379255", "step": 2966, "epoch": 3 }, { "type": "loss", "content": 0.0009690428851172328, "timestamp": "2025-09-04 04:12:09.399917", "step": 2967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:09.495805", "step": 2967, "epoch": 3 }, { "type": "loss", "content": 0.01487466599792242, "timestamp": "2025-09-04 04:12:09.514085", "step": 2968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:12:09.614684", "step": 2968, "epoch": 3 }, { "type": "loss", "content": 0.08440519124269485, "timestamp": "2025-09-04 04:12:09.635750", "step": 2969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:09.734830", "step": 2969, "epoch": 3 }, { "type": "loss", "content": 0.020220091566443443, "timestamp": "2025-09-04 04:12:09.753528", "step": 2970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:09.852354", "step": 2970, "epoch": 3 }, { "type": "loss", "content": 0.010999851860105991, "timestamp": "2025-09-04 04:12:09.871044", "step": 2971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:12:09.956995", "step": 2971, "epoch": 3 }, { "type": "loss", "content": 0.022553278133273125, "timestamp": "2025-09-04 04:12:09.973369", "step": 2972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:12:10.044334", "step": 2972, "epoch": 3 }, { "type": "loss", "content": 0.0029222038574516773, "timestamp": "2025-09-04 04:12:10.058568", "step": 2973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:10.162220", "step": 2973, "epoch": 3 }, { "type": "loss", "content": 0.015705617144703865, "timestamp": "2025-09-04 04:12:10.181499", "step": 2974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 04:12:10.308836", "step": 2974, "epoch": 3 }, { "type": "loss", "content": 0.012157633900642395, "timestamp": "2025-09-04 04:12:10.333367", "step": 2975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1040 ], "flops": 20800126336064.0 }, "timestamp": "2025-09-04 04:12:10.486121", "step": 2975, "epoch": 3 }, { "type": "loss", "content": 0.016581635922193527, "timestamp": "2025-09-04 04:12:10.516149", "step": 2976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:12:10.617030", "step": 2976, "epoch": 3 }, { "type": "loss", "content": 0.012151544913649559, "timestamp": "2025-09-04 04:12:10.638199", "step": 2977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:12:10.717488", "step": 2977, "epoch": 3 }, { "type": "loss", "content": 0.00917992927134037, "timestamp": "2025-09-04 04:12:10.731631", "step": 2978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:10.836079", "step": 2978, "epoch": 3 }, { "type": "loss", "content": 0.002120188670232892, "timestamp": "2025-09-04 04:12:10.855226", "step": 2979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:10.948080", "step": 2979, "epoch": 3 }, { "type": "loss", "content": 0.0006955720018595457, "timestamp": "2025-09-04 04:12:10.965588", "step": 2980, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:12:19.458344", "step": 2980, "epoch": 3 }, { "type": "pplx", "content": 315.1214843451217, "timestamp": "2025-09-04 04:12:19.460879", "step": 2980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:19.548069", "step": 2980, "epoch": 3 }, { "type": "loss", "content": 0.003038703231140971, "timestamp": "2025-09-04 04:12:19.566567", "step": 2981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:19.678102", "step": 2981, "epoch": 3 }, { "type": "loss", "content": 0.0017276513390243053, "timestamp": "2025-09-04 04:12:19.698535", "step": 2982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:19.803796", "step": 2982, "epoch": 3 }, { "type": "loss", "content": 0.0019167335703969002, "timestamp": "2025-09-04 04:12:19.823105", "step": 2983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:19.913285", "step": 2983, "epoch": 3 }, { "type": "loss", "content": 0.004277768079191446, "timestamp": "2025-09-04 04:12:19.930819", "step": 2984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:20.021819", "step": 2984, "epoch": 3 }, { "type": "loss", "content": 0.011646011844277382, "timestamp": "2025-09-04 04:12:20.040607", "step": 2985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:12:20.146162", "step": 2985, "epoch": 3 }, { "type": "loss", "content": 0.0010585073614493012, "timestamp": "2025-09-04 04:12:20.166184", "step": 2986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:20.259593", "step": 2986, "epoch": 3 }, { "type": "loss", "content": 0.024997970089316368, "timestamp": "2025-09-04 04:12:20.276766", "step": 2987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:12:20.354448", "step": 2987, "epoch": 3 }, { "type": "loss", "content": 0.011168187484145164, "timestamp": "2025-09-04 04:12:20.369389", "step": 2988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:20.470109", "step": 2988, "epoch": 3 }, { "type": "loss", "content": 0.004837450571358204, "timestamp": "2025-09-04 04:12:20.491265", "step": 2989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:12:20.568690", "step": 2989, "epoch": 3 }, { "type": "loss", "content": 0.02616807632148266, "timestamp": "2025-09-04 04:12:20.582758", "step": 2990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:12:20.660612", "step": 2990, "epoch": 3 }, { "type": "loss", "content": 0.006141870282590389, "timestamp": "2025-09-04 04:12:20.674607", "step": 2991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:12:20.760305", "step": 2991, "epoch": 3 }, { "type": "loss", "content": 0.006691917777061462, "timestamp": "2025-09-04 04:12:20.776741", "step": 2992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:20.873019", "step": 2992, "epoch": 3 }, { "type": "loss", "content": 0.005890341941267252, "timestamp": "2025-09-04 04:12:20.893402", "step": 2993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:20.986981", "step": 2993, "epoch": 3 }, { "type": "loss", "content": 0.0004606809816323221, "timestamp": "2025-09-04 04:12:21.004539", "step": 2994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:21.107947", "step": 2994, "epoch": 3 }, { "type": "loss", "content": 0.004719878546893597, "timestamp": "2025-09-04 04:12:21.127136", "step": 2995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:12:21.237919", "step": 2995, "epoch": 3 }, { "type": "loss", "content": 0.0019325355533510447, "timestamp": "2025-09-04 04:12:21.259351", "step": 2996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:21.355991", "step": 2996, "epoch": 3 }, { "type": "loss", "content": 0.0065997205674648285, "timestamp": "2025-09-04 04:12:21.376493", "step": 2997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:21.466283", "step": 2997, "epoch": 3 }, { "type": "loss", "content": 0.02511041797697544, "timestamp": "2025-09-04 04:12:21.483091", "step": 2998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:21.585402", "step": 2998, "epoch": 3 }, { "type": "loss", "content": 0.0015789009630680084, "timestamp": "2025-09-04 04:12:21.604698", "step": 2999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:21.703899", "step": 2999, "epoch": 3 }, { "type": "loss", "content": 0.010090545751154423, "timestamp": "2025-09-04 04:12:21.723385", "step": 3000, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:12:30.115857", "step": 3000, "epoch": 3 }, { "type": "pplx", "content": 318.48539959590164, "timestamp": "2025-09-04 04:12:30.117830", "step": 3000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3000", "timestamp": "2025-09-04 04:12:30.470352", "step": 3000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:12:30.569127", "step": 3000, "epoch": 3 }, { "type": "loss", "content": 0.0071251485496759415, "timestamp": "2025-09-04 04:12:30.589877", "step": 3001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:30.682094", "step": 3001, "epoch": 3 }, { "type": "loss", "content": 0.00022971679572947323, "timestamp": "2025-09-04 04:12:30.699269", "step": 3002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:12:30.784778", "step": 3002, "epoch": 3 }, { "type": "loss", "content": 0.013213549740612507, "timestamp": "2025-09-04 04:12:30.800298", "step": 3003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:12:30.872344", "step": 3003, "epoch": 3 }, { "type": "loss", "content": 0.002448985120281577, "timestamp": "2025-09-04 04:12:30.886091", "step": 3004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:30.976093", "step": 3004, "epoch": 3 }, { "type": "loss", "content": 0.00946701131761074, "timestamp": "2025-09-04 04:12:30.994794", "step": 3005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:12:31.079497", "step": 3005, "epoch": 3 }, { "type": "loss", "content": 0.027095887809991837, "timestamp": "2025-09-04 04:12:31.095018", "step": 3006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:31.188994", "step": 3006, "epoch": 3 }, { "type": "loss", "content": 0.011162908747792244, "timestamp": "2025-09-04 04:12:31.206237", "step": 3007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:12:31.281961", "step": 3007, "epoch": 3 }, { "type": "loss", "content": 0.002971187699586153, "timestamp": "2025-09-04 04:12:31.296302", "step": 3008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:31.401735", "step": 3008, "epoch": 3 }, { "type": "loss", "content": 0.0016095120226964355, "timestamp": "2025-09-04 04:12:31.424309", "step": 3009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:12:31.531150", "step": 3009, "epoch": 3 }, { "type": "loss", "content": 0.0003574864531401545, "timestamp": "2025-09-04 04:12:31.551257", "step": 3010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:31.644083", "step": 3010, "epoch": 3 }, { "type": "loss", "content": 0.05357489734888077, "timestamp": "2025-09-04 04:12:31.660973", "step": 3011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:31.762905", "step": 3011, "epoch": 3 }, { "type": "loss", "content": 0.025260241702198982, "timestamp": "2025-09-04 04:12:31.781102", "step": 3012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:12:31.895906", "step": 3012, "epoch": 3 }, { "type": "loss", "content": 0.002299356274306774, "timestamp": "2025-09-04 04:12:31.920205", "step": 3013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:32.012619", "step": 3013, "epoch": 3 }, { "type": "loss", "content": 0.005653650965541601, "timestamp": "2025-09-04 04:12:32.029774", "step": 3014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:32.138215", "step": 3014, "epoch": 3 }, { "type": "loss", "content": 0.004577314481139183, "timestamp": "2025-09-04 04:12:32.157429", "step": 3015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:12:32.234613", "step": 3015, "epoch": 3 }, { "type": "loss", "content": 0.007822668179869652, "timestamp": "2025-09-04 04:12:32.249408", "step": 3016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:32.349594", "step": 3016, "epoch": 3 }, { "type": "loss", "content": 0.0035554529167711735, "timestamp": "2025-09-04 04:12:32.370303", "step": 3017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:12:32.446897", "step": 3017, "epoch": 3 }, { "type": "loss", "content": 0.045765411108732224, "timestamp": "2025-09-04 04:12:32.460693", "step": 3018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:12:32.597497", "step": 3018, "epoch": 3 }, { "type": "loss", "content": 0.0015078384894877672, "timestamp": "2025-09-04 04:12:32.623398", "step": 3019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:32.724410", "step": 3019, "epoch": 3 }, { "type": "loss", "content": 0.0004389840178191662, "timestamp": "2025-09-04 04:12:32.743531", "step": 3020, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:12:41.245228", "step": 3020, "epoch": 3 }, { "type": "pplx", "content": 316.8819015781872, "timestamp": "2025-09-04 04:12:41.247591", "step": 3020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:41.352031", "step": 3020, "epoch": 3 }, { "type": "loss", "content": 0.01975194923579693, "timestamp": "2025-09-04 04:12:41.374506", "step": 3021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:41.485738", "step": 3021, "epoch": 3 }, { "type": "loss", "content": 0.002616028068587184, "timestamp": "2025-09-04 04:12:41.506249", "step": 3022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:12:41.626978", "step": 3022, "epoch": 3 }, { "type": "loss", "content": 0.014300533570349216, "timestamp": "2025-09-04 04:12:41.648721", "step": 3023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:12:41.733490", "step": 3023, "epoch": 3 }, { "type": "loss", "content": 0.020479435101151466, "timestamp": "2025-09-04 04:12:41.749264", "step": 3024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:41.843852", "step": 3024, "epoch": 3 }, { "type": "loss", "content": 0.005946870427578688, "timestamp": "2025-09-04 04:12:41.863014", "step": 3025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:41.966192", "step": 3025, "epoch": 3 }, { "type": "loss", "content": 0.020112913101911545, "timestamp": "2025-09-04 04:12:41.984766", "step": 3026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:42.080065", "step": 3026, "epoch": 3 }, { "type": "loss", "content": 0.0003505939384922385, "timestamp": "2025-09-04 04:12:42.097153", "step": 3027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:12:42.212691", "step": 3027, "epoch": 3 }, { "type": "loss", "content": 0.018574239686131477, "timestamp": "2025-09-04 04:12:42.234095", "step": 3028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:42.336816", "step": 3028, "epoch": 3 }, { "type": "loss", "content": 0.006098731886595488, "timestamp": "2025-09-04 04:12:42.357909", "step": 3029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:42.466999", "step": 3029, "epoch": 3 }, { "type": "loss", "content": 0.011256312020123005, "timestamp": "2025-09-04 04:12:42.484487", "step": 3030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:42.581316", "step": 3030, "epoch": 3 }, { "type": "loss", "content": 0.02321520633995533, "timestamp": "2025-09-04 04:12:42.598704", "step": 3031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:42.695635", "step": 3031, "epoch": 3 }, { "type": "loss", "content": 0.013380464166402817, "timestamp": "2025-09-04 04:12:42.713836", "step": 3032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:12:42.827414", "step": 3032, "epoch": 3 }, { "type": "loss", "content": 0.06969982385635376, "timestamp": "2025-09-04 04:12:42.849634", "step": 3033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:42.960351", "step": 3033, "epoch": 3 }, { "type": "loss", "content": 0.001032329280860722, "timestamp": "2025-09-04 04:12:42.980870", "step": 3034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:43.081469", "step": 3034, "epoch": 3 }, { "type": "loss", "content": 0.00046730105532333255, "timestamp": "2025-09-04 04:12:43.099978", "step": 3035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:43.202230", "step": 3035, "epoch": 3 }, { "type": "loss", "content": 0.0037853161338716745, "timestamp": "2025-09-04 04:12:43.221820", "step": 3036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:12:43.303940", "step": 3036, "epoch": 3 }, { "type": "loss", "content": 0.0006179303163662553, "timestamp": "2025-09-04 04:12:43.320561", "step": 3037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:43.411798", "step": 3037, "epoch": 3 }, { "type": "loss", "content": 0.017078617587685585, "timestamp": "2025-09-04 04:12:43.428547", "step": 3038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:43.526095", "step": 3038, "epoch": 3 }, { "type": "loss", "content": 0.0008244368946179748, "timestamp": "2025-09-04 04:12:43.543550", "step": 3039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:43.648130", "step": 3039, "epoch": 3 }, { "type": "loss", "content": 0.0033969872165471315, "timestamp": "2025-09-04 04:12:43.668121", "step": 3040, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:12:52.082447", "step": 3040, "epoch": 3 }, { "type": "pplx", "content": 313.3448310983682, "timestamp": "2025-09-04 04:12:52.084610", "step": 3040, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3040", "timestamp": "2025-09-04 04:12:52.615369", "step": 3040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:52.716905", "step": 3040, "epoch": 3 }, { "type": "loss", "content": 0.007473244331777096, "timestamp": "2025-09-04 04:12:52.737524", "step": 3041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1488 ], "flops": 29760180728640.0 }, "timestamp": "2025-09-04 04:12:52.963136", "step": 3041, "epoch": 3 }, { "type": "loss", "content": 0.0020905376877635717, "timestamp": "2025-09-04 04:12:53.005306", "step": 3042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:53.111301", "step": 3042, "epoch": 3 }, { "type": "loss", "content": 0.004675985313951969, "timestamp": "2025-09-04 04:12:53.130386", "step": 3043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:53.226870", "step": 3043, "epoch": 3 }, { "type": "loss", "content": 0.004783936310559511, "timestamp": "2025-09-04 04:12:53.244571", "step": 3044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:12:53.320321", "step": 3044, "epoch": 3 }, { "type": "loss", "content": 0.024307608604431152, "timestamp": "2025-09-04 04:12:53.334873", "step": 3045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:53.448940", "step": 3045, "epoch": 3 }, { "type": "loss", "content": 0.0014469543239101768, "timestamp": "2025-09-04 04:12:53.469323", "step": 3046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:12:53.556850", "step": 3046, "epoch": 3 }, { "type": "loss", "content": 0.01129270438104868, "timestamp": "2025-09-04 04:12:53.572473", "step": 3047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:53.663644", "step": 3047, "epoch": 3 }, { "type": "loss", "content": 0.009008025750517845, "timestamp": "2025-09-04 04:12:53.681178", "step": 3048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:12:53.784218", "step": 3048, "epoch": 3 }, { "type": "loss", "content": 0.05535700172185898, "timestamp": "2025-09-04 04:12:53.806182", "step": 3049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:12:53.908028", "step": 3049, "epoch": 3 }, { "type": "loss", "content": 0.00018787295266520232, "timestamp": "2025-09-04 04:12:53.927211", "step": 3050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:12:54.021968", "step": 3050, "epoch": 3 }, { "type": "loss", "content": 0.0013808540534228086, "timestamp": "2025-09-04 04:12:54.039368", "step": 3051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:12:54.131919", "step": 3051, "epoch": 3 }, { "type": "loss", "content": 0.001885344390757382, "timestamp": "2025-09-04 04:12:54.149844", "step": 3052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:12:54.250441", "step": 3052, "epoch": 3 }, { "type": "loss", "content": 0.007413564249873161, "timestamp": "2025-09-04 04:12:54.271642", "step": 3053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:54.365785", "step": 3053, "epoch": 3 }, { "type": "loss", "content": 0.004210531245917082, "timestamp": "2025-09-04 04:12:54.383238", "step": 3054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:12:54.492486", "step": 3054, "epoch": 3 }, { "type": "loss", "content": 0.004505162592977285, "timestamp": "2025-09-04 04:12:54.513000", "step": 3055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:12:54.604034", "step": 3055, "epoch": 3 }, { "type": "loss", "content": 0.0008125255117192864, "timestamp": "2025-09-04 04:12:54.621731", "step": 3056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:12:54.702929", "step": 3056, "epoch": 3 }, { "type": "loss", "content": 0.011885403655469418, "timestamp": "2025-09-04 04:12:54.719635", "step": 3057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:12:54.814604", "step": 3057, "epoch": 3 }, { "type": "loss", "content": 0.004823492839932442, "timestamp": "2025-09-04 04:12:54.832086", "step": 3058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:12:54.932251", "step": 3058, "epoch": 3 }, { "type": "loss", "content": 0.023230794817209244, "timestamp": "2025-09-04 04:12:54.951261", "step": 3059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:12:55.052619", "step": 3059, "epoch": 3 }, { "type": "loss", "content": 0.06023210659623146, "timestamp": "2025-09-04 04:12:55.071941", "step": 3060, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:03.577234", "step": 3060, "epoch": 3 }, { "type": "pplx", "content": 310.1316931403451, "timestamp": "2025-09-04 04:13:03.579410", "step": 3060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:13:03.660065", "step": 3060, "epoch": 3 }, { "type": "loss", "content": 0.005574073176831007, "timestamp": "2025-09-04 04:13:03.676255", "step": 3061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:03.783932", "step": 3061, "epoch": 3 }, { "type": "loss", "content": 0.0004931015428155661, "timestamp": "2025-09-04 04:13:03.803610", "step": 3062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:03.910704", "step": 3062, "epoch": 3 }, { "type": "loss", "content": 0.005763449240475893, "timestamp": "2025-09-04 04:13:03.927601", "step": 3063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:13:04.032696", "step": 3063, "epoch": 3 }, { "type": "loss", "content": 0.015147325582802296, "timestamp": "2025-09-04 04:13:04.052545", "step": 3064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:04.151878", "step": 3064, "epoch": 3 }, { "type": "loss", "content": 0.013065881095826626, "timestamp": "2025-09-04 04:13:04.172337", "step": 3065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:13:04.257765", "step": 3065, "epoch": 3 }, { "type": "loss", "content": 0.06708889454603195, "timestamp": "2025-09-04 04:13:04.272762", "step": 3066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:13:04.478255", "step": 3066, "epoch": 3 }, { "type": "loss", "content": 0.0030464141163975, "timestamp": "2025-09-04 04:13:04.517184", "step": 3067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:13:04.629622", "step": 3067, "epoch": 3 }, { "type": "loss", "content": 0.004817434120923281, "timestamp": "2025-09-04 04:13:04.650836", "step": 3068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:13:04.727967", "step": 3068, "epoch": 3 }, { "type": "loss", "content": 0.010416961275041103, "timestamp": "2025-09-04 04:13:04.742894", "step": 3069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:13:04.853693", "step": 3069, "epoch": 3 }, { "type": "loss", "content": 0.01584682986140251, "timestamp": "2025-09-04 04:13:04.873990", "step": 3070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:13:04.953426", "step": 3070, "epoch": 3 }, { "type": "loss", "content": 0.0056066811084747314, "timestamp": "2025-09-04 04:13:04.967223", "step": 3071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:13:05.041133", "step": 3071, "epoch": 3 }, { "type": "loss", "content": 0.0062073455192148685, "timestamp": "2025-09-04 04:13:05.054608", "step": 3072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:13:05.176203", "step": 3072, "epoch": 3 }, { "type": "loss", "content": 0.007986130192875862, "timestamp": "2025-09-04 04:13:05.201566", "step": 3073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:13:05.280097", "step": 3073, "epoch": 3 }, { "type": "loss", "content": 0.002340728882700205, "timestamp": "2025-09-04 04:13:05.293930", "step": 3074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 04:13:05.428660", "step": 3074, "epoch": 3 }, { "type": "loss", "content": 0.01518856268376112, "timestamp": "2025-09-04 04:13:05.453088", "step": 3075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:05.553750", "step": 3075, "epoch": 3 }, { "type": "loss", "content": 0.011933338828384876, "timestamp": "2025-09-04 04:13:05.573036", "step": 3076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:05.665519", "step": 3076, "epoch": 3 }, { "type": "loss", "content": 0.019831262528896332, "timestamp": "2025-09-04 04:13:05.684346", "step": 3077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:05.779207", "step": 3077, "epoch": 3 }, { "type": "loss", "content": 0.00772702693939209, "timestamp": "2025-09-04 04:13:05.796138", "step": 3078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:13:05.886183", "step": 3078, "epoch": 3 }, { "type": "loss", "content": 0.03503193333745003, "timestamp": "2025-09-04 04:13:05.901623", "step": 3079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:05.996518", "step": 3079, "epoch": 3 }, { "type": "loss", "content": 0.03195195645093918, "timestamp": "2025-09-04 04:13:06.014248", "step": 3080, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:14.451244", "step": 3080, "epoch": 3 }, { "type": "pplx", "content": 309.69911719013754, "timestamp": "2025-09-04 04:13:14.453284", "step": 3080, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3080", "timestamp": "2025-09-04 04:13:14.996139", "step": 3080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:15.099957", "step": 3080, "epoch": 3 }, { "type": "loss", "content": 0.005432614590972662, "timestamp": "2025-09-04 04:13:15.122154", "step": 3081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:15.215538", "step": 3081, "epoch": 3 }, { "type": "loss", "content": 0.07827738672494888, "timestamp": "2025-09-04 04:13:15.232831", "step": 3082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:13:15.335844", "step": 3082, "epoch": 3 }, { "type": "loss", "content": 0.0020832966547459364, "timestamp": "2025-09-04 04:13:15.355072", "step": 3083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:15.464323", "step": 3083, "epoch": 3 }, { "type": "loss", "content": 0.003370664082467556, "timestamp": "2025-09-04 04:13:15.485209", "step": 3084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:15.590624", "step": 3084, "epoch": 3 }, { "type": "loss", "content": 0.0391119010746479, "timestamp": "2025-09-04 04:13:15.612896", "step": 3085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:15.719186", "step": 3085, "epoch": 3 }, { "type": "loss", "content": 0.0071410383097827435, "timestamp": "2025-09-04 04:13:15.739191", "step": 3086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:15.846022", "step": 3086, "epoch": 3 }, { "type": "loss", "content": 0.044355932623147964, "timestamp": "2025-09-04 04:13:15.866073", "step": 3087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:15.971664", "step": 3087, "epoch": 3 }, { "type": "loss", "content": 0.02203553542494774, "timestamp": "2025-09-04 04:13:15.992432", "step": 3088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:13:16.066807", "step": 3088, "epoch": 3 }, { "type": "loss", "content": 0.002954278141260147, "timestamp": "2025-09-04 04:13:16.081846", "step": 3089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:13:16.163704", "step": 3089, "epoch": 3 }, { "type": "loss", "content": 0.0011478732340037823, "timestamp": "2025-09-04 04:13:16.178843", "step": 3090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:13:16.274385", "step": 3090, "epoch": 3 }, { "type": "loss", "content": 0.03504842892289162, "timestamp": "2025-09-04 04:13:16.291859", "step": 3091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:13:16.368898", "step": 3091, "epoch": 3 }, { "type": "loss", "content": 0.0484900027513504, "timestamp": "2025-09-04 04:13:16.383661", "step": 3092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:16.472091", "step": 3092, "epoch": 3 }, { "type": "loss", "content": 0.02142166718840599, "timestamp": "2025-09-04 04:13:16.490533", "step": 3093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:13:16.574016", "step": 3093, "epoch": 3 }, { "type": "loss", "content": 0.0017112598288804293, "timestamp": "2025-09-04 04:13:16.589190", "step": 3094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:16.698367", "step": 3094, "epoch": 3 }, { "type": "loss", "content": 0.0018777156947180629, "timestamp": "2025-09-04 04:13:16.718616", "step": 3095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:16.818290", "step": 3095, "epoch": 3 }, { "type": "loss", "content": 0.04635737091302872, "timestamp": "2025-09-04 04:13:16.837679", "step": 3096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:13:16.918617", "step": 3096, "epoch": 3 }, { "type": "loss", "content": 0.000942113867495209, "timestamp": "2025-09-04 04:13:16.933954", "step": 3097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:13:17.010857", "step": 3097, "epoch": 3 }, { "type": "loss", "content": 0.0178525447845459, "timestamp": "2025-09-04 04:13:17.024613", "step": 3098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:13:17.100145", "step": 3098, "epoch": 3 }, { "type": "loss", "content": 0.002621579449623823, "timestamp": "2025-09-04 04:13:17.113931", "step": 3099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:13:17.197423", "step": 3099, "epoch": 3 }, { "type": "loss", "content": 0.003176899626851082, "timestamp": "2025-09-04 04:13:17.213366", "step": 3100, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:25.591497", "step": 3100, "epoch": 3 }, { "type": "pplx", "content": 309.6383679000778, "timestamp": "2025-09-04 04:13:25.594304", "step": 3100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:13:25.708607", "step": 3100, "epoch": 3 }, { "type": "loss", "content": 0.018567724153399467, "timestamp": "2025-09-04 04:13:25.732430", "step": 3101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:13:25.836648", "step": 3101, "epoch": 3 }, { "type": "loss", "content": 0.001145319314673543, "timestamp": "2025-09-04 04:13:25.855894", "step": 3102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:25.953996", "step": 3102, "epoch": 3 }, { "type": "loss", "content": 0.0030605667270720005, "timestamp": "2025-09-04 04:13:25.972593", "step": 3103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:13:26.083056", "step": 3103, "epoch": 3 }, { "type": "loss", "content": 0.003627515397965908, "timestamp": "2025-09-04 04:13:26.104379", "step": 3104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:26.195117", "step": 3104, "epoch": 3 }, { "type": "loss", "content": 0.0005803365493193269, "timestamp": "2025-09-04 04:13:26.214144", "step": 3105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:13:26.288628", "step": 3105, "epoch": 3 }, { "type": "loss", "content": 0.001379349734634161, "timestamp": "2025-09-04 04:13:26.302105", "step": 3106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:13:26.411779", "step": 3106, "epoch": 3 }, { "type": "loss", "content": 0.01873205043375492, "timestamp": "2025-09-04 04:13:26.432178", "step": 3107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:13:26.535336", "step": 3107, "epoch": 3 }, { "type": "loss", "content": 0.0028933845460414886, "timestamp": "2025-09-04 04:13:26.555296", "step": 3108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:26.660819", "step": 3108, "epoch": 3 }, { "type": "loss", "content": 0.001489466754719615, "timestamp": "2025-09-04 04:13:26.682708", "step": 3109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:13:26.757809", "step": 3109, "epoch": 3 }, { "type": "loss", "content": 0.002094303723424673, "timestamp": "2025-09-04 04:13:26.771384", "step": 3110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:26.866276", "step": 3110, "epoch": 3 }, { "type": "loss", "content": 0.0006700966041535139, "timestamp": "2025-09-04 04:13:26.883690", "step": 3111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:26.983151", "step": 3111, "epoch": 3 }, { "type": "loss", "content": 0.01674632728099823, "timestamp": "2025-09-04 04:13:27.002754", "step": 3112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:27.105954", "step": 3112, "epoch": 3 }, { "type": "loss", "content": 0.08993439376354218, "timestamp": "2025-09-04 04:13:27.127787", "step": 3113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:13:27.237145", "step": 3113, "epoch": 3 }, { "type": "loss", "content": 0.012733870185911655, "timestamp": "2025-09-04 04:13:27.257788", "step": 3114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:13:27.330723", "step": 3114, "epoch": 3 }, { "type": "loss", "content": 0.025248989462852478, "timestamp": "2025-09-04 04:13:27.343584", "step": 3115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:27.437623", "step": 3115, "epoch": 3 }, { "type": "loss", "content": 0.0020341791678220034, "timestamp": "2025-09-04 04:13:27.455792", "step": 3116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:27.547797", "step": 3116, "epoch": 3 }, { "type": "loss", "content": 0.0012790296459570527, "timestamp": "2025-09-04 04:13:27.566869", "step": 3117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:13:27.670689", "step": 3117, "epoch": 3 }, { "type": "loss", "content": 0.003262518672272563, "timestamp": "2025-09-04 04:13:27.689875", "step": 3118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:27.782359", "step": 3118, "epoch": 3 }, { "type": "loss", "content": 0.04332924634218216, "timestamp": "2025-09-04 04:13:27.799486", "step": 3119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:13:27.894946", "step": 3119, "epoch": 3 }, { "type": "loss", "content": 0.0012113949051126838, "timestamp": "2025-09-04 04:13:27.913216", "step": 3120, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:36.378564", "step": 3120, "epoch": 3 }, { "type": "pplx", "content": 307.53064608444055, "timestamp": "2025-09-04 04:13:36.380625", "step": 3120, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3120", "timestamp": "2025-09-04 04:13:36.827667", "step": 3120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:13:36.933924", "step": 3120, "epoch": 3 }, { "type": "loss", "content": 0.0004694383533205837, "timestamp": "2025-09-04 04:13:36.956474", "step": 3121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:37.050096", "step": 3121, "epoch": 3 }, { "type": "loss", "content": 0.00680502038449049, "timestamp": "2025-09-04 04:13:37.067253", "step": 3122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:37.166392", "step": 3122, "epoch": 3 }, { "type": "loss", "content": 0.009919785894453526, "timestamp": "2025-09-04 04:13:37.184994", "step": 3123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:13:37.286944", "step": 3123, "epoch": 3 }, { "type": "loss", "content": 0.016770748421549797, "timestamp": "2025-09-04 04:13:37.307060", "step": 3124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:37.398424", "step": 3124, "epoch": 3 }, { "type": "loss", "content": 0.005784905049949884, "timestamp": "2025-09-04 04:13:37.417173", "step": 3125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:37.517434", "step": 3125, "epoch": 3 }, { "type": "loss", "content": 0.02264421060681343, "timestamp": "2025-09-04 04:13:37.536320", "step": 3126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 04:13:37.600702", "step": 3126, "epoch": 3 }, { "type": "loss", "content": 0.004687449894845486, "timestamp": "2025-09-04 04:13:37.611963", "step": 3127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:13:37.728323", "step": 3127, "epoch": 3 }, { "type": "loss", "content": 0.051042910665273666, "timestamp": "2025-09-04 04:13:37.751279", "step": 3128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:37.840140", "step": 3128, "epoch": 3 }, { "type": "loss", "content": 0.0002769632264971733, "timestamp": "2025-09-04 04:13:37.858257", "step": 3129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:13:37.936618", "step": 3129, "epoch": 3 }, { "type": "loss", "content": 0.011999246664345264, "timestamp": "2025-09-04 04:13:37.950724", "step": 3130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:38.050492", "step": 3130, "epoch": 3 }, { "type": "loss", "content": 0.005194882862269878, "timestamp": "2025-09-04 04:13:38.069413", "step": 3131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:38.169541", "step": 3131, "epoch": 3 }, { "type": "loss", "content": 0.037000637501478195, "timestamp": "2025-09-04 04:13:38.189321", "step": 3132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:13:38.273497", "step": 3132, "epoch": 3 }, { "type": "loss", "content": 0.010427704080939293, "timestamp": "2025-09-04 04:13:38.290645", "step": 3133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:13:38.396605", "step": 3133, "epoch": 3 }, { "type": "loss", "content": 0.002125280909240246, "timestamp": "2025-09-04 04:13:38.416728", "step": 3134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:38.512019", "step": 3134, "epoch": 3 }, { "type": "loss", "content": 0.0023927779402583838, "timestamp": "2025-09-04 04:13:38.529572", "step": 3135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:38.628419", "step": 3135, "epoch": 3 }, { "type": "loss", "content": 0.00024334284535143524, "timestamp": "2025-09-04 04:13:38.647962", "step": 3136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:38.738054", "step": 3136, "epoch": 3 }, { "type": "loss", "content": 0.0012799968244507909, "timestamp": "2025-09-04 04:13:38.756943", "step": 3137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:13:38.859262", "step": 3137, "epoch": 3 }, { "type": "loss", "content": 0.010953391902148724, "timestamp": "2025-09-04 04:13:38.878460", "step": 3138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:13:38.956198", "step": 3138, "epoch": 3 }, { "type": "loss", "content": 0.0005597640410996974, "timestamp": "2025-09-04 04:13:38.970039", "step": 3139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:39.060351", "step": 3139, "epoch": 3 }, { "type": "loss", "content": 0.017131542786955833, "timestamp": "2025-09-04 04:13:39.077857", "step": 3140, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:47.459705", "step": 3140, "epoch": 3 }, { "type": "pplx", "content": 303.543175064668, "timestamp": "2025-09-04 04:13:47.461952", "step": 3140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:13:47.544243", "step": 3140, "epoch": 3 }, { "type": "loss", "content": 0.005871990229934454, "timestamp": "2025-09-04 04:13:47.561346", "step": 3141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:13:47.683316", "step": 3141, "epoch": 3 }, { "type": "loss", "content": 0.004386106040328741, "timestamp": "2025-09-04 04:13:47.706499", "step": 3142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:47.800908", "step": 3142, "epoch": 3 }, { "type": "loss", "content": 0.0068616243079304695, "timestamp": "2025-09-04 04:13:47.818294", "step": 3143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:47.911953", "step": 3143, "epoch": 3 }, { "type": "loss", "content": 0.0012470950605347753, "timestamp": "2025-09-04 04:13:47.930050", "step": 3144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:13:48.062441", "step": 3144, "epoch": 3 }, { "type": "loss", "content": 0.008273441344499588, "timestamp": "2025-09-04 04:13:48.090809", "step": 3145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:48.190377", "step": 3145, "epoch": 3 }, { "type": "loss", "content": 0.05135876312851906, "timestamp": "2025-09-04 04:13:48.209014", "step": 3146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:48.301863", "step": 3146, "epoch": 3 }, { "type": "loss", "content": 0.026613635942339897, "timestamp": "2025-09-04 04:13:48.318989", "step": 3147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:48.427395", "step": 3147, "epoch": 3 }, { "type": "loss", "content": 0.006781783886253834, "timestamp": "2025-09-04 04:13:48.448382", "step": 3148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:48.540151", "step": 3148, "epoch": 3 }, { "type": "loss", "content": 0.0013767415657639503, "timestamp": "2025-09-04 04:13:48.558954", "step": 3149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:13:48.663250", "step": 3149, "epoch": 3 }, { "type": "loss", "content": 0.06130528450012207, "timestamp": "2025-09-04 04:13:48.682512", "step": 3150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:13:48.792712", "step": 3150, "epoch": 3 }, { "type": "loss", "content": 0.011846323497593403, "timestamp": "2025-09-04 04:13:48.813358", "step": 3151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:48.904857", "step": 3151, "epoch": 3 }, { "type": "loss", "content": 0.015273387543857098, "timestamp": "2025-09-04 04:13:48.922531", "step": 3152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:13:49.043109", "step": 3152, "epoch": 3 }, { "type": "loss", "content": 0.0032542271073907614, "timestamp": "2025-09-04 04:13:49.068724", "step": 3153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:49.161574", "step": 3153, "epoch": 3 }, { "type": "loss", "content": 0.002285633934661746, "timestamp": "2025-09-04 04:13:49.178731", "step": 3154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:13:49.280117", "step": 3154, "epoch": 3 }, { "type": "loss", "content": 0.009557440876960754, "timestamp": "2025-09-04 04:13:49.299107", "step": 3155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:49.389864", "step": 3155, "epoch": 3 }, { "type": "loss", "content": 0.018284492194652557, "timestamp": "2025-09-04 04:13:49.407496", "step": 3156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:13:49.498858", "step": 3156, "epoch": 3 }, { "type": "loss", "content": 0.0062835500575602055, "timestamp": "2025-09-04 04:13:49.517799", "step": 3157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:13:49.595561", "step": 3157, "epoch": 3 }, { "type": "loss", "content": 0.006542867515236139, "timestamp": "2025-09-04 04:13:49.609201", "step": 3158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:49.708783", "step": 3158, "epoch": 3 }, { "type": "loss", "content": 0.004803582560271025, "timestamp": "2025-09-04 04:13:49.727388", "step": 3159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:13:49.813834", "step": 3159, "epoch": 3 }, { "type": "loss", "content": 0.005237384233623743, "timestamp": "2025-09-04 04:13:49.830356", "step": 3160, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:13:58.274255", "step": 3160, "epoch": 3 }, { "type": "pplx", "content": 299.2577897885672, "timestamp": "2025-09-04 04:13:58.276754", "step": 3160, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3160", "timestamp": "2025-09-04 04:13:58.818167", "step": 3160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:13:58.901615", "step": 3160, "epoch": 3 }, { "type": "loss", "content": 0.012152140960097313, "timestamp": "2025-09-04 04:13:58.917815", "step": 3161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:59.010068", "step": 3161, "epoch": 3 }, { "type": "loss", "content": 0.0011444678530097008, "timestamp": "2025-09-04 04:13:59.026556", "step": 3162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:13:59.127832", "step": 3162, "epoch": 3 }, { "type": "loss", "content": 0.0307242963463068, "timestamp": "2025-09-04 04:13:59.146322", "step": 3163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:13:59.262752", "step": 3163, "epoch": 3 }, { "type": "loss", "content": 0.009461762383580208, "timestamp": "2025-09-04 04:13:59.284146", "step": 3164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:13:59.373507", "step": 3164, "epoch": 3 }, { "type": "loss", "content": 0.0035491350572556257, "timestamp": "2025-09-04 04:13:59.391646", "step": 3165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:13:59.501388", "step": 3165, "epoch": 3 }, { "type": "loss", "content": 0.0005383518873713911, "timestamp": "2025-09-04 04:13:59.521601", "step": 3166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:13:59.632898", "step": 3166, "epoch": 3 }, { "type": "loss", "content": 0.005555047653615475, "timestamp": "2025-09-04 04:13:59.653292", "step": 3167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:13:59.738310", "step": 3167, "epoch": 3 }, { "type": "loss", "content": 0.0026055641938000917, "timestamp": "2025-09-04 04:13:59.754242", "step": 3168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:13:59.846632", "step": 3168, "epoch": 3 }, { "type": "loss", "content": 0.007750915363430977, "timestamp": "2025-09-04 04:13:59.865546", "step": 3169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:13:59.975571", "step": 3169, "epoch": 3 }, { "type": "loss", "content": 0.030947135761380196, "timestamp": "2025-09-04 04:13:59.996080", "step": 3170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:00.095959", "step": 3170, "epoch": 3 }, { "type": "loss", "content": 0.008231641724705696, "timestamp": "2025-09-04 04:14:00.113266", "step": 3171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:00.217518", "step": 3171, "epoch": 3 }, { "type": "loss", "content": 0.0042179482989013195, "timestamp": "2025-09-04 04:14:00.237314", "step": 3172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:14:00.346381", "step": 3172, "epoch": 3 }, { "type": "loss", "content": 0.0010643589776009321, "timestamp": "2025-09-04 04:14:00.368931", "step": 3173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:00.473465", "step": 3173, "epoch": 3 }, { "type": "loss", "content": 0.01745988056063652, "timestamp": "2025-09-04 04:14:00.492705", "step": 3174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:14:00.586556", "step": 3174, "epoch": 3 }, { "type": "loss", "content": 0.040873829275369644, "timestamp": "2025-09-04 04:14:00.603677", "step": 3175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:00.705209", "step": 3175, "epoch": 3 }, { "type": "loss", "content": 0.0004095417389180511, "timestamp": "2025-09-04 04:14:00.724761", "step": 3176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:00.826009", "step": 3176, "epoch": 3 }, { "type": "loss", "content": 0.05139699578285217, "timestamp": "2025-09-04 04:14:00.847061", "step": 3177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:00.947870", "step": 3177, "epoch": 3 }, { "type": "loss", "content": 0.01573573239147663, "timestamp": "2025-09-04 04:14:00.966290", "step": 3178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:14:01.038915", "step": 3178, "epoch": 3 }, { "type": "loss", "content": 0.004238112363964319, "timestamp": "2025-09-04 04:14:01.051828", "step": 3179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:01.146291", "step": 3179, "epoch": 3 }, { "type": "loss", "content": 0.013266210444271564, "timestamp": "2025-09-04 04:14:01.164460", "step": 3180, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:14:09.548016", "step": 3180, "epoch": 3 }, { "type": "pplx", "content": 297.72411337269, "timestamp": "2025-09-04 04:14:09.549979", "step": 3180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:09.645901", "step": 3180, "epoch": 3 }, { "type": "loss", "content": 0.004828155972063541, "timestamp": "2025-09-04 04:14:09.666733", "step": 3181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:14:09.737653", "step": 3181, "epoch": 3 }, { "type": "loss", "content": 0.00019382215396035463, "timestamp": "2025-09-04 04:14:09.750394", "step": 3182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:09.843532", "step": 3182, "epoch": 3 }, { "type": "loss", "content": 0.012143196538090706, "timestamp": "2025-09-04 04:14:09.860957", "step": 3183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:09.966472", "step": 3183, "epoch": 3 }, { "type": "loss", "content": 0.012787654995918274, "timestamp": "2025-09-04 04:14:09.987350", "step": 3184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:10.101021", "step": 3184, "epoch": 3 }, { "type": "loss", "content": 0.0021711078006774187, "timestamp": "2025-09-04 04:14:10.122082", "step": 3185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:14:10.199346", "step": 3185, "epoch": 3 }, { "type": "loss", "content": 0.016758840531110764, "timestamp": "2025-09-04 04:14:10.213436", "step": 3186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:10.318855", "step": 3186, "epoch": 3 }, { "type": "loss", "content": 0.0012713070027530193, "timestamp": "2025-09-04 04:14:10.338888", "step": 3187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:10.441584", "step": 3187, "epoch": 3 }, { "type": "loss", "content": 0.002686247928068042, "timestamp": "2025-09-04 04:14:10.461620", "step": 3188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:14:10.567666", "step": 3188, "epoch": 3 }, { "type": "loss", "content": 0.009823828935623169, "timestamp": "2025-09-04 04:14:10.590298", "step": 3189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:14:10.673129", "step": 3189, "epoch": 3 }, { "type": "loss", "content": 0.011689892038702965, "timestamp": "2025-09-04 04:14:10.688286", "step": 3190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:14:10.771771", "step": 3190, "epoch": 3 }, { "type": "loss", "content": 0.0027343537658452988, "timestamp": "2025-09-04 04:14:10.787075", "step": 3191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:10.886962", "step": 3191, "epoch": 3 }, { "type": "loss", "content": 0.008726535364985466, "timestamp": "2025-09-04 04:14:10.906597", "step": 3192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:11.007668", "step": 3192, "epoch": 3 }, { "type": "loss", "content": 0.006816827226430178, "timestamp": "2025-09-04 04:14:11.028767", "step": 3193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:14:11.141248", "step": 3193, "epoch": 3 }, { "type": "loss", "content": 0.00433404790237546, "timestamp": "2025-09-04 04:14:11.161661", "step": 3194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:14:11.271042", "step": 3194, "epoch": 3 }, { "type": "loss", "content": 0.005224962718784809, "timestamp": "2025-09-04 04:14:11.291628", "step": 3195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:11.402317", "step": 3195, "epoch": 3 }, { "type": "loss", "content": 0.00549314683303237, "timestamp": "2025-09-04 04:14:11.421990", "step": 3196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:11.512659", "step": 3196, "epoch": 3 }, { "type": "loss", "content": 0.015487665310502052, "timestamp": "2025-09-04 04:14:11.531859", "step": 3197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:11.627288", "step": 3197, "epoch": 3 }, { "type": "loss", "content": 0.005331105552613735, "timestamp": "2025-09-04 04:14:11.644713", "step": 3198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:11.745595", "step": 3198, "epoch": 3 }, { "type": "loss", "content": 0.0038703870959579945, "timestamp": "2025-09-04 04:14:11.764490", "step": 3199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:14:11.841979", "step": 3199, "epoch": 3 }, { "type": "loss", "content": 0.013186764903366566, "timestamp": "2025-09-04 04:14:11.856696", "step": 3200, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:14:20.282753", "step": 3200, "epoch": 3 }, { "type": "pplx", "content": 298.56920811002476, "timestamp": "2025-09-04 04:14:20.284562", "step": 3200, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3200", "timestamp": "2025-09-04 04:14:20.791042", "step": 3200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:14:20.863498", "step": 3200, "epoch": 3 }, { "type": "loss", "content": 0.027715997770428658, "timestamp": "2025-09-04 04:14:20.878154", "step": 3201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:20.982383", "step": 3201, "epoch": 3 }, { "type": "loss", "content": 0.0028067301027476788, "timestamp": "2025-09-04 04:14:21.001527", "step": 3202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:21.100913", "step": 3202, "epoch": 3 }, { "type": "loss", "content": 0.0029784520156681538, "timestamp": "2025-09-04 04:14:21.119598", "step": 3203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:21.215903", "step": 3203, "epoch": 3 }, { "type": "loss", "content": 0.0007651972700841725, "timestamp": "2025-09-04 04:14:21.234311", "step": 3204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:21.335668", "step": 3204, "epoch": 3 }, { "type": "loss", "content": 0.03393160179257393, "timestamp": "2025-09-04 04:14:21.356658", "step": 3205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:21.452878", "step": 3205, "epoch": 3 }, { "type": "loss", "content": 0.0033680200576782227, "timestamp": "2025-09-04 04:14:21.470525", "step": 3206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:14:21.556580", "step": 3206, "epoch": 3 }, { "type": "loss", "content": 0.022258544340729713, "timestamp": "2025-09-04 04:14:21.572030", "step": 3207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:21.666815", "step": 3207, "epoch": 3 }, { "type": "loss", "content": 0.000977307092398405, "timestamp": "2025-09-04 04:14:21.685202", "step": 3208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:14:21.802759", "step": 3208, "epoch": 3 }, { "type": "loss", "content": 0.009067521430552006, "timestamp": "2025-09-04 04:14:21.826562", "step": 3209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:21.930639", "step": 3209, "epoch": 3 }, { "type": "loss", "content": 0.005365348886698484, "timestamp": "2025-09-04 04:14:21.949872", "step": 3210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:22.049801", "step": 3210, "epoch": 3 }, { "type": "loss", "content": 0.016756800934672356, "timestamp": "2025-09-04 04:14:22.068544", "step": 3211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:14:22.152516", "step": 3211, "epoch": 3 }, { "type": "loss", "content": 0.0015038455603644252, "timestamp": "2025-09-04 04:14:22.167154", "step": 3212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:14:22.248469", "step": 3212, "epoch": 3 }, { "type": "loss", "content": 0.002096136100590229, "timestamp": "2025-09-04 04:14:22.265055", "step": 3213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:14:22.347176", "step": 3213, "epoch": 3 }, { "type": "loss", "content": 0.006991859059780836, "timestamp": "2025-09-04 04:14:22.362368", "step": 3214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:22.465008", "step": 3214, "epoch": 3 }, { "type": "loss", "content": 0.004486733116209507, "timestamp": "2025-09-04 04:14:22.484327", "step": 3215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:14:22.574918", "step": 3215, "epoch": 3 }, { "type": "loss", "content": 0.015569731593132019, "timestamp": "2025-09-04 04:14:22.592568", "step": 3216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:22.690066", "step": 3216, "epoch": 3 }, { "type": "loss", "content": 0.02023530937731266, "timestamp": "2025-09-04 04:14:22.710895", "step": 3217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:14:22.834269", "step": 3217, "epoch": 3 }, { "type": "loss", "content": 0.009704073891043663, "timestamp": "2025-09-04 04:14:22.857521", "step": 3218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:22.960755", "step": 3218, "epoch": 3 }, { "type": "loss", "content": 0.00012312929902691394, "timestamp": "2025-09-04 04:14:22.980128", "step": 3219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:14:23.101629", "step": 3219, "epoch": 3 }, { "type": "loss", "content": 0.0024548498913645744, "timestamp": "2025-09-04 04:14:23.125454", "step": 3220, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:14:31.604710", "step": 3220, "epoch": 3 }, { "type": "pplx", "content": 302.84948080396117, "timestamp": "2025-09-04 04:14:31.606931", "step": 3220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:14:31.688935", "step": 3220, "epoch": 3 }, { "type": "loss", "content": 0.03970217704772949, "timestamp": "2025-09-04 04:14:31.706195", "step": 3221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:31.806815", "step": 3221, "epoch": 3 }, { "type": "loss", "content": 0.0249653160572052, "timestamp": "2025-09-04 04:14:31.825636", "step": 3222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:14:31.903292", "step": 3222, "epoch": 3 }, { "type": "loss", "content": 0.00563463568687439, "timestamp": "2025-09-04 04:14:31.917510", "step": 3223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:14:32.010566", "step": 3223, "epoch": 3 }, { "type": "loss", "content": 0.012554388493299484, "timestamp": "2025-09-04 04:14:32.028457", "step": 3224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:14:32.118821", "step": 3224, "epoch": 3 }, { "type": "loss", "content": 0.045332036912441254, "timestamp": "2025-09-04 04:14:32.137174", "step": 3225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:32.238227", "step": 3225, "epoch": 3 }, { "type": "loss", "content": 0.0016650618053972721, "timestamp": "2025-09-04 04:14:32.257069", "step": 3226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:32.350727", "step": 3226, "epoch": 3 }, { "type": "loss", "content": 0.0041397614404559135, "timestamp": "2025-09-04 04:14:32.368149", "step": 3227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:32.462491", "step": 3227, "epoch": 3 }, { "type": "loss", "content": 0.0027826486621052027, "timestamp": "2025-09-04 04:14:32.480741", "step": 3228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:14:32.568652", "step": 3228, "epoch": 3 }, { "type": "loss", "content": 0.007442697882652283, "timestamp": "2025-09-04 04:14:32.587093", "step": 3229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:14:32.665351", "step": 3229, "epoch": 3 }, { "type": "loss", "content": 0.006308171898126602, "timestamp": "2025-09-04 04:14:32.679438", "step": 3230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:32.772202", "step": 3230, "epoch": 3 }, { "type": "loss", "content": 0.0004488340055104345, "timestamp": "2025-09-04 04:14:32.789529", "step": 3231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:14:32.866260", "step": 3231, "epoch": 3 }, { "type": "loss", "content": 0.0021919619757682085, "timestamp": "2025-09-04 04:14:32.881055", "step": 3232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:14:32.971578", "step": 3232, "epoch": 3 }, { "type": "loss", "content": 0.007420377805829048, "timestamp": "2025-09-04 04:14:32.990460", "step": 3233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:33.092494", "step": 3233, "epoch": 3 }, { "type": "loss", "content": 0.002644237130880356, "timestamp": "2025-09-04 04:14:33.111654", "step": 3234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:14:33.247913", "step": 3234, "epoch": 3 }, { "type": "loss", "content": 0.0011826629051938653, "timestamp": "2025-09-04 04:14:33.274252", "step": 3235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:33.375785", "step": 3235, "epoch": 3 }, { "type": "loss", "content": 0.02761908806860447, "timestamp": "2025-09-04 04:14:33.395525", "step": 3236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:33.497828", "step": 3236, "epoch": 3 }, { "type": "loss", "content": 0.06943929940462112, "timestamp": "2025-09-04 04:14:33.519104", "step": 3237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:33.618795", "step": 3237, "epoch": 3 }, { "type": "loss", "content": 0.012753068469464779, "timestamp": "2025-09-04 04:14:33.637467", "step": 3238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:33.741847", "step": 3238, "epoch": 3 }, { "type": "loss", "content": 0.008138732053339481, "timestamp": "2025-09-04 04:14:33.761229", "step": 3239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:33.864802", "step": 3239, "epoch": 3 }, { "type": "loss", "content": 0.0004093741299584508, "timestamp": "2025-09-04 04:14:33.884886", "step": 3240, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:14:42.381848", "step": 3240, "epoch": 3 }, { "type": "pplx", "content": 302.56651160009335, "timestamp": "2025-09-04 04:14:42.383805", "step": 3240, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3240", "timestamp": "2025-09-04 04:14:42.856128", "step": 3240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:14:42.923762", "step": 3240, "epoch": 3 }, { "type": "loss", "content": 0.003945726901292801, "timestamp": "2025-09-04 04:14:42.937279", "step": 3241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:14:43.015815", "step": 3241, "epoch": 3 }, { "type": "loss", "content": 0.0013066886458545923, "timestamp": "2025-09-04 04:14:43.030023", "step": 3242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:43.128688", "step": 3242, "epoch": 3 }, { "type": "loss", "content": 0.005977214314043522, "timestamp": "2025-09-04 04:14:43.147386", "step": 3243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:43.247664", "step": 3243, "epoch": 3 }, { "type": "loss", "content": 0.006708620116114616, "timestamp": "2025-09-04 04:14:43.267330", "step": 3244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:43.366677", "step": 3244, "epoch": 3 }, { "type": "loss", "content": 0.010317733511328697, "timestamp": "2025-09-04 04:14:43.387825", "step": 3245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:14:43.497352", "step": 3245, "epoch": 3 }, { "type": "loss", "content": 0.001994991209357977, "timestamp": "2025-09-04 04:14:43.518042", "step": 3246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:43.613568", "step": 3246, "epoch": 3 }, { "type": "loss", "content": 0.0030987162608653307, "timestamp": "2025-09-04 04:14:43.631222", "step": 3247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:14:43.740095", "step": 3247, "epoch": 3 }, { "type": "loss", "content": 0.0005180987645871937, "timestamp": "2025-09-04 04:14:43.761304", "step": 3248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:43.852971", "step": 3248, "epoch": 3 }, { "type": "loss", "content": 0.00039079232374206185, "timestamp": "2025-09-04 04:14:43.872164", "step": 3249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:43.966444", "step": 3249, "epoch": 3 }, { "type": "loss", "content": 0.00027972026146017015, "timestamp": "2025-09-04 04:14:43.984013", "step": 3250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:14:44.093314", "step": 3250, "epoch": 3 }, { "type": "loss", "content": 0.0020345128141343594, "timestamp": "2025-09-04 04:14:44.113985", "step": 3251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:44.214840", "step": 3251, "epoch": 3 }, { "type": "loss", "content": 0.008928967639803886, "timestamp": "2025-09-04 04:14:44.234614", "step": 3252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:44.326751", "step": 3252, "epoch": 3 }, { "type": "loss", "content": 0.0039917477406561375, "timestamp": "2025-09-04 04:14:44.346001", "step": 3253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:14:44.448569", "step": 3253, "epoch": 3 }, { "type": "loss", "content": 0.0008001399110071361, "timestamp": "2025-09-04 04:14:44.467465", "step": 3254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:44.562224", "step": 3254, "epoch": 3 }, { "type": "loss", "content": 0.0018813287606462836, "timestamp": "2025-09-04 04:14:44.579611", "step": 3255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:14:44.665229", "step": 3255, "epoch": 3 }, { "type": "loss", "content": 0.0071727619506418705, "timestamp": "2025-09-04 04:14:44.681606", "step": 3256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:14:44.755422", "step": 3256, "epoch": 3 }, { "type": "loss", "content": 0.019573703408241272, "timestamp": "2025-09-04 04:14:44.770563", "step": 3257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:14:44.879668", "step": 3257, "epoch": 3 }, { "type": "loss", "content": 0.010637468658387661, "timestamp": "2025-09-04 04:14:44.900200", "step": 3258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:44.995225", "step": 3258, "epoch": 3 }, { "type": "loss", "content": 0.0005085250595584512, "timestamp": "2025-09-04 04:14:45.012783", "step": 3259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:45.118189", "step": 3259, "epoch": 3 }, { "type": "loss", "content": 0.004459428135305643, "timestamp": "2025-09-04 04:14:45.139052", "step": 3260, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:14:53.619951", "step": 3260, "epoch": 3 }, { "type": "pplx", "content": 303.6349730675357, "timestamp": "2025-09-04 04:14:53.621797", "step": 3260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:53.720593", "step": 3260, "epoch": 3 }, { "type": "loss", "content": 0.007222423795610666, "timestamp": "2025-09-04 04:14:53.741702", "step": 3261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:14:53.833132", "step": 3261, "epoch": 3 }, { "type": "loss", "content": 0.0010594201739877462, "timestamp": "2025-09-04 04:14:53.849926", "step": 3262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:53.949469", "step": 3262, "epoch": 3 }, { "type": "loss", "content": 0.002619499806314707, "timestamp": "2025-09-04 04:14:53.968160", "step": 3263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:14:54.069273", "step": 3263, "epoch": 3 }, { "type": "loss", "content": 0.002300729276612401, "timestamp": "2025-09-04 04:14:54.088775", "step": 3264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:14:54.193743", "step": 3264, "epoch": 3 }, { "type": "loss", "content": 0.0029513502959161997, "timestamp": "2025-09-04 04:14:54.215967", "step": 3265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:54.311347", "step": 3265, "epoch": 3 }, { "type": "loss", "content": 0.0033043273724615574, "timestamp": "2025-09-04 04:14:54.328764", "step": 3266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:14:54.423680", "step": 3266, "epoch": 3 }, { "type": "loss", "content": 0.0076232897117733955, "timestamp": "2025-09-04 04:14:54.441064", "step": 3267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:14:54.549806", "step": 3267, "epoch": 3 }, { "type": "loss", "content": 0.012323771603405476, "timestamp": "2025-09-04 04:14:54.570969", "step": 3268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:54.677234", "step": 3268, "epoch": 3 }, { "type": "loss", "content": 0.014858097769320011, "timestamp": "2025-09-04 04:14:54.699205", "step": 3269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:14:54.771795", "step": 3269, "epoch": 3 }, { "type": "loss", "content": 0.006694823037832975, "timestamp": "2025-09-04 04:14:54.784684", "step": 3270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:14:54.870959", "step": 3270, "epoch": 3 }, { "type": "loss", "content": 0.0045067863538861275, "timestamp": "2025-09-04 04:14:54.886529", "step": 3271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:54.994351", "step": 3271, "epoch": 3 }, { "type": "loss", "content": 0.0029527172446250916, "timestamp": "2025-09-04 04:14:55.015092", "step": 3272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:14:55.106218", "step": 3272, "epoch": 3 }, { "type": "loss", "content": 0.010578243993222713, "timestamp": "2025-09-04 04:14:55.124807", "step": 3273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:14:55.217610", "step": 3273, "epoch": 3 }, { "type": "loss", "content": 0.004275831393897533, "timestamp": "2025-09-04 04:14:55.234893", "step": 3274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:14:55.338811", "step": 3274, "epoch": 3 }, { "type": "loss", "content": 0.004477000795304775, "timestamp": "2025-09-04 04:14:55.358098", "step": 3275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:14:55.454527", "step": 3275, "epoch": 3 }, { "type": "loss", "content": 0.003879428841173649, "timestamp": "2025-09-04 04:14:55.472942", "step": 3276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:14:55.577939", "step": 3276, "epoch": 3 }, { "type": "loss", "content": 0.06313685327768326, "timestamp": "2025-09-04 04:14:55.599965", "step": 3277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 04:14:55.763310", "step": 3277, "epoch": 3 }, { "type": "loss", "content": 0.002089696703478694, "timestamp": "2025-09-04 04:14:55.795506", "step": 3278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:14:55.880590", "step": 3278, "epoch": 3 }, { "type": "loss", "content": 0.0007018198375590146, "timestamp": "2025-09-04 04:14:55.895788", "step": 3279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:14:55.998173", "step": 3279, "epoch": 3 }, { "type": "loss", "content": 0.02527954801917076, "timestamp": "2025-09-04 04:14:56.018237", "step": 3280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:04.467066", "step": 3280, "epoch": 3 }, { "type": "pplx", "content": 306.23200739000436, "timestamp": "2025-09-04 04:15:04.469448", "step": 3280, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3280", "timestamp": "2025-09-04 04:15:04.982903", "step": 3280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:15:05.079346", "step": 3280, "epoch": 3 }, { "type": "loss", "content": 0.03897909075021744, "timestamp": "2025-09-04 04:15:05.099648", "step": 3281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:05.202261", "step": 3281, "epoch": 3 }, { "type": "loss", "content": 0.0016802679747343063, "timestamp": "2025-09-04 04:15:05.221465", "step": 3282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:05.329841", "step": 3282, "epoch": 3 }, { "type": "loss", "content": 0.002108318265527487, "timestamp": "2025-09-04 04:15:05.349197", "step": 3283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:05.452573", "step": 3283, "epoch": 3 }, { "type": "loss", "content": 0.00026367095415480435, "timestamp": "2025-09-04 04:15:05.472578", "step": 3284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:15:05.563126", "step": 3284, "epoch": 3 }, { "type": "loss", "content": 0.0026424103416502476, "timestamp": "2025-09-04 04:15:05.581936", "step": 3285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:15:05.699833", "step": 3285, "epoch": 3 }, { "type": "loss", "content": 0.00553141999989748, "timestamp": "2025-09-04 04:15:05.721891", "step": 3286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:05.814069", "step": 3286, "epoch": 3 }, { "type": "loss", "content": 0.009483684785664082, "timestamp": "2025-09-04 04:15:05.830807", "step": 3287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:15:05.926426", "step": 3287, "epoch": 3 }, { "type": "loss", "content": 0.0044782888144254684, "timestamp": "2025-09-04 04:15:05.944699", "step": 3288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:15:06.061273", "step": 3288, "epoch": 3 }, { "type": "loss", "content": 0.0009773524943739176, "timestamp": "2025-09-04 04:15:06.085178", "step": 3289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:06.192109", "step": 3289, "epoch": 3 }, { "type": "loss", "content": 0.00231315684504807, "timestamp": "2025-09-04 04:15:06.212110", "step": 3290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:06.305889", "step": 3290, "epoch": 3 }, { "type": "loss", "content": 0.0034602778032422066, "timestamp": "2025-09-04 04:15:06.320028", "step": 3291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:15:06.402443", "step": 3291, "epoch": 3 }, { "type": "loss", "content": 0.005539227742701769, "timestamp": "2025-09-04 04:15:06.418345", "step": 3292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:15:06.498804", "step": 3292, "epoch": 3 }, { "type": "loss", "content": 0.0033372659236192703, "timestamp": "2025-09-04 04:15:06.515338", "step": 3293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:06.635892", "step": 3293, "epoch": 3 }, { "type": "loss", "content": 0.0007293337839655578, "timestamp": "2025-09-04 04:15:06.655876", "step": 3294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:15:06.791530", "step": 3294, "epoch": 3 }, { "type": "loss", "content": 0.0006761676049791276, "timestamp": "2025-09-04 04:15:06.817475", "step": 3295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:15:06.904333", "step": 3295, "epoch": 3 }, { "type": "loss", "content": 0.032214339822530746, "timestamp": "2025-09-04 04:15:06.920789", "step": 3296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:15:06.994544", "step": 3296, "epoch": 3 }, { "type": "loss", "content": 0.004622759763151407, "timestamp": "2025-09-04 04:15:07.009354", "step": 3297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 448 ], "flops": 8960054460160.0 }, "timestamp": "2025-09-04 04:15:07.081752", "step": 3297, "epoch": 3 }, { "type": "loss", "content": 0.00865192525088787, "timestamp": "2025-09-04 04:15:07.094680", "step": 3298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:15:07.190603", "step": 3298, "epoch": 3 }, { "type": "loss", "content": 0.004962395876646042, "timestamp": "2025-09-04 04:15:07.208103", "step": 3299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:15:07.306707", "step": 3299, "epoch": 3 }, { "type": "loss", "content": 0.004638133570551872, "timestamp": "2025-09-04 04:15:07.324605", "step": 3300, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:15.708785", "step": 3300, "epoch": 3 }, { "type": "pplx", "content": 310.93065235576375, "timestamp": "2025-09-04 04:15:15.710762", "step": 3300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:15.809849", "step": 3300, "epoch": 3 }, { "type": "loss", "content": 0.00043890104279853404, "timestamp": "2025-09-04 04:15:15.831027", "step": 3301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:15.936809", "step": 3301, "epoch": 3 }, { "type": "loss", "content": 0.031615160405635834, "timestamp": "2025-09-04 04:15:15.956837", "step": 3302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:16.034876", "step": 3302, "epoch": 3 }, { "type": "loss", "content": 0.012209202162921429, "timestamp": "2025-09-04 04:15:16.048981", "step": 3303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:16.139795", "step": 3303, "epoch": 3 }, { "type": "loss", "content": 0.01653936877846718, "timestamp": "2025-09-04 04:15:16.157321", "step": 3304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:16.255858", "step": 3304, "epoch": 3 }, { "type": "loss", "content": 0.016090011224150658, "timestamp": "2025-09-04 04:15:16.276637", "step": 3305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:16.379531", "step": 3305, "epoch": 3 }, { "type": "loss", "content": 0.0072241052985191345, "timestamp": "2025-09-04 04:15:16.398754", "step": 3306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:16.499129", "step": 3306, "epoch": 3 }, { "type": "loss", "content": 0.0002566951443441212, "timestamp": "2025-09-04 04:15:16.517889", "step": 3307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:16.596145", "step": 3307, "epoch": 3 }, { "type": "loss", "content": 0.0032684989273548126, "timestamp": "2025-09-04 04:15:16.610992", "step": 3308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:15:16.685266", "step": 3308, "epoch": 3 }, { "type": "loss", "content": 0.0016026400262489915, "timestamp": "2025-09-04 04:15:16.700263", "step": 3309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:16.807189", "step": 3309, "epoch": 3 }, { "type": "loss", "content": 0.002181933494284749, "timestamp": "2025-09-04 04:15:16.827297", "step": 3310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:16.937081", "step": 3310, "epoch": 3 }, { "type": "loss", "content": 0.002524265320971608, "timestamp": "2025-09-04 04:15:16.957733", "step": 3311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:17.068037", "step": 3311, "epoch": 3 }, { "type": "loss", "content": 0.0015753493644297123, "timestamp": "2025-09-04 04:15:17.089325", "step": 3312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:15:17.173509", "step": 3312, "epoch": 3 }, { "type": "loss", "content": 0.010916945524513721, "timestamp": "2025-09-04 04:15:17.190442", "step": 3313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:17.300267", "step": 3313, "epoch": 3 }, { "type": "loss", "content": 0.002287115901708603, "timestamp": "2025-09-04 04:15:17.320788", "step": 3314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:17.411659", "step": 3314, "epoch": 3 }, { "type": "loss", "content": 0.02453738823533058, "timestamp": "2025-09-04 04:15:17.428428", "step": 3315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:15:17.526468", "step": 3315, "epoch": 3 }, { "type": "loss", "content": 0.0024574180133640766, "timestamp": "2025-09-04 04:15:17.545929", "step": 3316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:17.634340", "step": 3316, "epoch": 3 }, { "type": "loss", "content": 0.003129773773252964, "timestamp": "2025-09-04 04:15:17.652692", "step": 3317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:15:17.730020", "step": 3317, "epoch": 3 }, { "type": "loss", "content": 0.0017890299204736948, "timestamp": "2025-09-04 04:15:17.744156", "step": 3318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:17.846641", "step": 3318, "epoch": 3 }, { "type": "loss", "content": 0.0003455560654401779, "timestamp": "2025-09-04 04:15:17.865764", "step": 3319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:17.955724", "step": 3319, "epoch": 3 }, { "type": "loss", "content": 0.028852151706814766, "timestamp": "2025-09-04 04:15:17.973313", "step": 3320, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:26.366823", "step": 3320, "epoch": 3 }, { "type": "pplx", "content": 315.5775818717673, "timestamp": "2025-09-04 04:15:26.368541", "step": 3320, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3320", "timestamp": "2025-09-04 04:15:26.717810", "step": 3320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:15:26.835668", "step": 3320, "epoch": 3 }, { "type": "loss", "content": 0.0002499267866369337, "timestamp": "2025-09-04 04:15:26.860977", "step": 3321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:15:26.945054", "step": 3321, "epoch": 3 }, { "type": "loss", "content": 0.0008101621060632169, "timestamp": "2025-09-04 04:15:26.960654", "step": 3322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:15:27.054780", "step": 3322, "epoch": 3 }, { "type": "loss", "content": 0.012700149789452553, "timestamp": "2025-09-04 04:15:27.072188", "step": 3323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:15:27.172098", "step": 3323, "epoch": 3 }, { "type": "loss", "content": 0.007206479553133249, "timestamp": "2025-09-04 04:15:27.191492", "step": 3324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:27.268978", "step": 3324, "epoch": 3 }, { "type": "loss", "content": 0.013278383761644363, "timestamp": "2025-09-04 04:15:27.284463", "step": 3325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:27.395471", "step": 3325, "epoch": 3 }, { "type": "loss", "content": 0.004693718161433935, "timestamp": "2025-09-04 04:15:27.414655", "step": 3326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:15:27.490883", "step": 3326, "epoch": 3 }, { "type": "loss", "content": 0.029808765277266502, "timestamp": "2025-09-04 04:15:27.504664", "step": 3327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:27.608290", "step": 3327, "epoch": 3 }, { "type": "loss", "content": 0.00016743317246437073, "timestamp": "2025-09-04 04:15:27.628357", "step": 3328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:15:27.703701", "step": 3328, "epoch": 3 }, { "type": "loss", "content": 0.018314022570848465, "timestamp": "2025-09-04 04:15:27.718997", "step": 3329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:27.828684", "step": 3329, "epoch": 3 }, { "type": "loss", "content": 0.0024994483683258295, "timestamp": "2025-09-04 04:15:27.849182", "step": 3330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:15:27.935748", "step": 3330, "epoch": 3 }, { "type": "loss", "content": 0.00011107311001978815, "timestamp": "2025-09-04 04:15:27.951411", "step": 3331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 848 ], "flops": 16960103024960.0 }, "timestamp": "2025-09-04 04:15:28.077014", "step": 3331, "epoch": 3 }, { "type": "loss", "content": 0.010431556962430477, "timestamp": "2025-09-04 04:15:28.101871", "step": 3332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:15:28.193747", "step": 3332, "epoch": 3 }, { "type": "loss", "content": 0.03157045319676399, "timestamp": "2025-09-04 04:15:28.212571", "step": 3333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:15:28.296301", "step": 3333, "epoch": 3 }, { "type": "loss", "content": 0.04083694517612457, "timestamp": "2025-09-04 04:15:28.311522", "step": 3334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:15:28.405898", "step": 3334, "epoch": 3 }, { "type": "loss", "content": 0.0034233976621180773, "timestamp": "2025-09-04 04:15:28.423349", "step": 3335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:28.532837", "step": 3335, "epoch": 3 }, { "type": "loss", "content": 0.021260175853967667, "timestamp": "2025-09-04 04:15:28.554152", "step": 3336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:28.651272", "step": 3336, "epoch": 3 }, { "type": "loss", "content": 0.015738222748041153, "timestamp": "2025-09-04 04:15:28.671979", "step": 3337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:28.775677", "step": 3337, "epoch": 3 }, { "type": "loss", "content": 0.007692721672356129, "timestamp": "2025-09-04 04:15:28.794901", "step": 3338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:28.896271", "step": 3338, "epoch": 3 }, { "type": "loss", "content": 0.0014377308543771505, "timestamp": "2025-09-04 04:15:28.915136", "step": 3339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:28.993680", "step": 3339, "epoch": 3 }, { "type": "loss", "content": 0.016688672825694084, "timestamp": "2025-09-04 04:15:29.008612", "step": 3340, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:37.370468", "step": 3340, "epoch": 3 }, { "type": "pplx", "content": 312.33404172021034, "timestamp": "2025-09-04 04:15:37.372516", "step": 3340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:37.473968", "step": 3340, "epoch": 3 }, { "type": "loss", "content": 0.011500056833028793, "timestamp": "2025-09-04 04:15:37.495856", "step": 3341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:37.606004", "step": 3341, "epoch": 3 }, { "type": "loss", "content": 0.0288741085678339, "timestamp": "2025-09-04 04:15:37.626404", "step": 3342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:37.728252", "step": 3342, "epoch": 3 }, { "type": "loss", "content": 0.004480296280235052, "timestamp": "2025-09-04 04:15:37.747168", "step": 3343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:15:37.842312", "step": 3343, "epoch": 3 }, { "type": "loss", "content": 0.002481586765497923, "timestamp": "2025-09-04 04:15:37.860519", "step": 3344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:37.949491", "step": 3344, "epoch": 3 }, { "type": "loss", "content": 0.013428304344415665, "timestamp": "2025-09-04 04:15:37.967921", "step": 3345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:15:38.045038", "step": 3345, "epoch": 3 }, { "type": "loss", "content": 0.0397348590195179, "timestamp": "2025-09-04 04:15:38.058674", "step": 3346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:15:38.166347", "step": 3346, "epoch": 3 }, { "type": "loss", "content": 0.0024191653355956078, "timestamp": "2025-09-04 04:15:38.186670", "step": 3347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:15:38.302749", "step": 3347, "epoch": 3 }, { "type": "loss", "content": 0.0025882297195494175, "timestamp": "2025-09-04 04:15:38.325668", "step": 3348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:15:38.438407", "step": 3348, "epoch": 3 }, { "type": "loss", "content": 0.002427577506750822, "timestamp": "2025-09-04 04:15:38.461108", "step": 3349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:15:38.539248", "step": 3349, "epoch": 3 }, { "type": "loss", "content": 0.003565673716366291, "timestamp": "2025-09-04 04:15:38.553261", "step": 3350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:38.643791", "step": 3350, "epoch": 3 }, { "type": "loss", "content": 0.0014742841012775898, "timestamp": "2025-09-04 04:15:38.660494", "step": 3351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:15:38.770850", "step": 3351, "epoch": 3 }, { "type": "loss", "content": 0.0006036367267370224, "timestamp": "2025-09-04 04:15:38.792263", "step": 3352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:38.889981", "step": 3352, "epoch": 3 }, { "type": "loss", "content": 0.023486964404582977, "timestamp": "2025-09-04 04:15:38.910660", "step": 3353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:15:39.018492", "step": 3353, "epoch": 3 }, { "type": "loss", "content": 0.008537087589502335, "timestamp": "2025-09-04 04:15:39.038759", "step": 3354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:15:39.122884", "step": 3354, "epoch": 3 }, { "type": "loss", "content": 0.0004986139247193933, "timestamp": "2025-09-04 04:15:39.138020", "step": 3355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:39.245800", "step": 3355, "epoch": 3 }, { "type": "loss", "content": 0.03504369035363197, "timestamp": "2025-09-04 04:15:39.266945", "step": 3356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:15:39.343131", "step": 3356, "epoch": 3 }, { "type": "loss", "content": 0.017468314617872238, "timestamp": "2025-09-04 04:15:39.358533", "step": 3357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:39.461610", "step": 3357, "epoch": 3 }, { "type": "loss", "content": 0.005034047178924084, "timestamp": "2025-09-04 04:15:39.480862", "step": 3358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:15:39.566712", "step": 3358, "epoch": 3 }, { "type": "loss", "content": 0.031177129596471786, "timestamp": "2025-09-04 04:15:39.582311", "step": 3359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:15:39.684803", "step": 3359, "epoch": 3 }, { "type": "loss", "content": 0.0004434007278177887, "timestamp": "2025-09-04 04:15:39.704763", "step": 3360, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:48.145682", "step": 3360, "epoch": 3 }, { "type": "pplx", "content": 301.0731658055851, "timestamp": "2025-09-04 04:15:48.147942", "step": 3360, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3360", "timestamp": "2025-09-04 04:15:48.687287", "step": 3360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:15:48.767367", "step": 3360, "epoch": 3 }, { "type": "loss", "content": 0.0026787512470036745, "timestamp": "2025-09-04 04:15:48.783668", "step": 3361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:48.896544", "step": 3361, "epoch": 3 }, { "type": "loss", "content": 0.05144086107611656, "timestamp": "2025-09-04 04:15:48.916900", "step": 3362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:15:49.012988", "step": 3362, "epoch": 3 }, { "type": "loss", "content": 0.00442865677177906, "timestamp": "2025-09-04 04:15:49.030271", "step": 3363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:49.131246", "step": 3363, "epoch": 3 }, { "type": "loss", "content": 0.0024153843987733126, "timestamp": "2025-09-04 04:15:49.150792", "step": 3364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:15:49.247227", "step": 3364, "epoch": 3 }, { "type": "loss", "content": 0.00037681308458559215, "timestamp": "2025-09-04 04:15:49.267476", "step": 3365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:15:49.361875", "step": 3365, "epoch": 3 }, { "type": "loss", "content": 0.004885226022452116, "timestamp": "2025-09-04 04:15:49.378943", "step": 3366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:49.479578", "step": 3366, "epoch": 3 }, { "type": "loss", "content": 0.020293110981583595, "timestamp": "2025-09-04 04:15:49.498362", "step": 3367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:15:49.601224", "step": 3367, "epoch": 3 }, { "type": "loss", "content": 0.02302255854010582, "timestamp": "2025-09-04 04:15:49.620988", "step": 3368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:15:49.727729", "step": 3368, "epoch": 3 }, { "type": "loss", "content": 0.0001401986082782969, "timestamp": "2025-09-04 04:15:49.750154", "step": 3369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:15:49.859459", "step": 3369, "epoch": 3 }, { "type": "loss", "content": 0.03139295056462288, "timestamp": "2025-09-04 04:15:49.879856", "step": 3370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:49.981889", "step": 3370, "epoch": 3 }, { "type": "loss", "content": 0.000647324079181999, "timestamp": "2025-09-04 04:15:50.000762", "step": 3371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:15:50.071999", "step": 3371, "epoch": 3 }, { "type": "loss", "content": 0.020890971645712852, "timestamp": "2025-09-04 04:15:50.085529", "step": 3372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:15:50.181131", "step": 3372, "epoch": 3 }, { "type": "loss", "content": 0.0002276118320878595, "timestamp": "2025-09-04 04:15:50.200295", "step": 3373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:15:50.306852", "step": 3373, "epoch": 3 }, { "type": "loss", "content": 0.021090731024742126, "timestamp": "2025-09-04 04:15:50.326971", "step": 3374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:15:50.463072", "step": 3374, "epoch": 3 }, { "type": "loss", "content": 0.0053464737720787525, "timestamp": "2025-09-04 04:15:50.489249", "step": 3375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:15:50.600962", "step": 3375, "epoch": 3 }, { "type": "loss", "content": 0.00169714679941535, "timestamp": "2025-09-04 04:15:50.622394", "step": 3376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:15:50.705172", "step": 3376, "epoch": 3 }, { "type": "loss", "content": 0.0071422443725168705, "timestamp": "2025-09-04 04:15:50.722252", "step": 3377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:15:50.812122", "step": 3377, "epoch": 3 }, { "type": "loss", "content": 0.0026480015367269516, "timestamp": "2025-09-04 04:15:50.828896", "step": 3378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:50.928878", "step": 3378, "epoch": 3 }, { "type": "loss", "content": 0.0017498015658929944, "timestamp": "2025-09-04 04:15:50.947886", "step": 3379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:51.048104", "step": 3379, "epoch": 3 }, { "type": "loss", "content": 0.001030558254569769, "timestamp": "2025-09-04 04:15:51.067707", "step": 3380, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:15:59.446548", "step": 3380, "epoch": 3 }, { "type": "pplx", "content": 294.73432370067934, "timestamp": "2025-09-04 04:15:59.448500", "step": 3380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:15:59.530511", "step": 3380, "epoch": 3 }, { "type": "loss", "content": 0.00033817789517343044, "timestamp": "2025-09-04 04:15:59.547754", "step": 3381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:15:59.648370", "step": 3381, "epoch": 3 }, { "type": "loss", "content": 0.0006461319862864912, "timestamp": "2025-09-04 04:15:59.667244", "step": 3382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:15:59.752986", "step": 3382, "epoch": 3 }, { "type": "loss", "content": 0.027155397459864616, "timestamp": "2025-09-04 04:15:59.768515", "step": 3383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:15:59.866742", "step": 3383, "epoch": 3 }, { "type": "loss", "content": 0.07015629857778549, "timestamp": "2025-09-04 04:15:59.884939", "step": 3384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:15:59.998615", "step": 3384, "epoch": 3 }, { "type": "loss", "content": 0.0016538852360099554, "timestamp": "2025-09-04 04:16:00.022906", "step": 3385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:00.122185", "step": 3385, "epoch": 3 }, { "type": "loss", "content": 0.11791013926267624, "timestamp": "2025-09-04 04:16:00.140724", "step": 3386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:16:00.228744", "step": 3386, "epoch": 3 }, { "type": "loss", "content": 0.014148048125207424, "timestamp": "2025-09-04 04:16:00.244329", "step": 3387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:16:00.365436", "step": 3387, "epoch": 3 }, { "type": "loss", "content": 0.0005752384895458817, "timestamp": "2025-09-04 04:16:00.389346", "step": 3388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:00.490559", "step": 3388, "epoch": 3 }, { "type": "loss", "content": 0.0020116129890084267, "timestamp": "2025-09-04 04:16:00.511811", "step": 3389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:00.612654", "step": 3389, "epoch": 3 }, { "type": "loss", "content": 0.015090583823621273, "timestamp": "2025-09-04 04:16:00.631434", "step": 3390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:16:00.726083", "step": 3390, "epoch": 3 }, { "type": "loss", "content": 0.0049343351274728775, "timestamp": "2025-09-04 04:16:00.743678", "step": 3391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:00.843939", "step": 3391, "epoch": 3 }, { "type": "loss", "content": 0.000529682612977922, "timestamp": "2025-09-04 04:16:00.863659", "step": 3392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:16:00.956813", "step": 3392, "epoch": 3 }, { "type": "loss", "content": 0.0011004252592101693, "timestamp": "2025-09-04 04:16:00.976139", "step": 3393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:01.070353", "step": 3393, "epoch": 3 }, { "type": "loss", "content": 0.02100779488682747, "timestamp": "2025-09-04 04:16:01.087563", "step": 3394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:01.187918", "step": 3394, "epoch": 3 }, { "type": "loss", "content": 0.010034440085291862, "timestamp": "2025-09-04 04:16:01.206884", "step": 3395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:01.299913", "step": 3395, "epoch": 3 }, { "type": "loss", "content": 0.04920711740851402, "timestamp": "2025-09-04 04:16:01.317898", "step": 3396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:01.423832", "step": 3396, "epoch": 3 }, { "type": "loss", "content": 0.041153181344270706, "timestamp": "2025-09-04 04:16:01.445643", "step": 3397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:01.549800", "step": 3397, "epoch": 3 }, { "type": "loss", "content": 0.02985711395740509, "timestamp": "2025-09-04 04:16:01.569180", "step": 3398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:01.668606", "step": 3398, "epoch": 3 }, { "type": "loss", "content": 0.000561038323212415, "timestamp": "2025-09-04 04:16:01.687288", "step": 3399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:01.787984", "step": 3399, "epoch": 3 }, { "type": "loss", "content": 0.002825426869094372, "timestamp": "2025-09-04 04:16:01.807598", "step": 3400, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:16:10.365512", "step": 3400, "epoch": 3 }, { "type": "pplx", "content": 285.49062028489874, "timestamp": "2025-09-04 04:16:10.371170", "step": 3400, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3400", "timestamp": "2025-09-04 04:16:10.757804", "step": 3400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:10.846129", "step": 3400, "epoch": 3 }, { "type": "loss", "content": 0.002441899385303259, "timestamp": "2025-09-04 04:16:10.864175", "step": 3401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:16:10.962424", "step": 3401, "epoch": 3 }, { "type": "loss", "content": 0.0007989015430212021, "timestamp": "2025-09-04 04:16:10.979656", "step": 3402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:11.084061", "step": 3402, "epoch": 3 }, { "type": "loss", "content": 0.0034658664371818304, "timestamp": "2025-09-04 04:16:11.103037", "step": 3403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:16:11.242472", "step": 3403, "epoch": 3 }, { "type": "loss", "content": 0.011140529066324234, "timestamp": "2025-09-04 04:16:11.269313", "step": 3404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:11.359769", "step": 3404, "epoch": 3 }, { "type": "loss", "content": 0.007334363646805286, "timestamp": "2025-09-04 04:16:11.377858", "step": 3405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:11.470040", "step": 3405, "epoch": 3 }, { "type": "loss", "content": 0.007528170011937618, "timestamp": "2025-09-04 04:16:11.486570", "step": 3406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:11.591013", "step": 3406, "epoch": 3 }, { "type": "loss", "content": 0.0002277525927638635, "timestamp": "2025-09-04 04:16:11.609976", "step": 3407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:11.702970", "step": 3407, "epoch": 3 }, { "type": "loss", "content": 0.0009056737762875855, "timestamp": "2025-09-04 04:16:11.720292", "step": 3408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:11.822714", "step": 3408, "epoch": 3 }, { "type": "loss", "content": 0.007609906140714884, "timestamp": "2025-09-04 04:16:11.843614", "step": 3409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:11.949959", "step": 3409, "epoch": 3 }, { "type": "loss", "content": 0.008335032500326633, "timestamp": "2025-09-04 04:16:11.969001", "step": 3410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:16:12.048036", "step": 3410, "epoch": 3 }, { "type": "loss", "content": 0.0005043984274379909, "timestamp": "2025-09-04 04:16:12.061607", "step": 3411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:12.174289", "step": 3411, "epoch": 3 }, { "type": "loss", "content": 0.005011504516005516, "timestamp": "2025-09-04 04:16:12.195379", "step": 3412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:12.294306", "step": 3412, "epoch": 3 }, { "type": "loss", "content": 0.030210444703698158, "timestamp": "2025-09-04 04:16:12.314552", "step": 3413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:12.420677", "step": 3413, "epoch": 3 }, { "type": "loss", "content": 0.0011688501108437777, "timestamp": "2025-09-04 04:16:12.439751", "step": 3414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:12.554307", "step": 3414, "epoch": 3 }, { "type": "loss", "content": 0.014814517460763454, "timestamp": "2025-09-04 04:16:12.573324", "step": 3415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:12.676000", "step": 3415, "epoch": 3 }, { "type": "loss", "content": 0.022736500948667526, "timestamp": "2025-09-04 04:16:12.695440", "step": 3416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:12.797542", "step": 3416, "epoch": 3 }, { "type": "loss", "content": 0.0009403983131051064, "timestamp": "2025-09-04 04:16:12.818416", "step": 3417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:16:12.930686", "step": 3417, "epoch": 3 }, { "type": "loss", "content": 0.003918331582099199, "timestamp": "2025-09-04 04:16:12.951167", "step": 3418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:16:13.070130", "step": 3418, "epoch": 3 }, { "type": "loss", "content": 0.002362527186051011, "timestamp": "2025-09-04 04:16:13.092009", "step": 3419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:13.199697", "step": 3419, "epoch": 3 }, { "type": "loss", "content": 0.011338985525071621, "timestamp": "2025-09-04 04:16:13.220218", "step": 3420, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:16:21.725284", "step": 3420, "epoch": 3 }, { "type": "pplx", "content": 281.34383504857846, "timestamp": "2025-09-04 04:16:21.727582", "step": 3420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:21.826125", "step": 3420, "epoch": 3 }, { "type": "loss", "content": 0.003105961252003908, "timestamp": "2025-09-04 04:16:21.847337", "step": 3421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:16:21.932865", "step": 3421, "epoch": 3 }, { "type": "loss", "content": 0.0025257884990423918, "timestamp": "2025-09-04 04:16:21.948610", "step": 3422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:22.040843", "step": 3422, "epoch": 3 }, { "type": "loss", "content": 0.0005112270591780543, "timestamp": "2025-09-04 04:16:22.057951", "step": 3423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 04:16:22.241785", "step": 3423, "epoch": 3 }, { "type": "loss", "content": 0.012885574251413345, "timestamp": "2025-09-04 04:16:22.277227", "step": 3424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:22.368483", "step": 3424, "epoch": 3 }, { "type": "loss", "content": 0.007698435802012682, "timestamp": "2025-09-04 04:16:22.387279", "step": 3425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:22.496149", "step": 3425, "epoch": 3 }, { "type": "loss", "content": 0.0006365908775478601, "timestamp": "2025-09-04 04:16:22.516680", "step": 3426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:22.619441", "step": 3426, "epoch": 3 }, { "type": "loss", "content": 0.026056919246912003, "timestamp": "2025-09-04 04:16:22.638651", "step": 3427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:16:22.722696", "step": 3427, "epoch": 3 }, { "type": "loss", "content": 0.0844995528459549, "timestamp": "2025-09-04 04:16:22.738555", "step": 3428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:16:22.822651", "step": 3428, "epoch": 3 }, { "type": "loss", "content": 0.0028854222036898136, "timestamp": "2025-09-04 04:16:22.839858", "step": 3429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:22.929402", "step": 3429, "epoch": 3 }, { "type": "loss", "content": 0.000976667390204966, "timestamp": "2025-09-04 04:16:22.946348", "step": 3430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:23.044574", "step": 3430, "epoch": 3 }, { "type": "loss", "content": 0.019412081688642502, "timestamp": "2025-09-04 04:16:23.063135", "step": 3431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:23.162180", "step": 3431, "epoch": 3 }, { "type": "loss", "content": 0.0002543810987845063, "timestamp": "2025-09-04 04:16:23.181583", "step": 3432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1248 ], "flops": 24960151589760.0 }, "timestamp": "2025-09-04 04:16:23.360867", "step": 3432, "epoch": 3 }, { "type": "loss", "content": 0.0077868239022791386, "timestamp": "2025-09-04 04:16:23.398858", "step": 3433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:23.498788", "step": 3433, "epoch": 3 }, { "type": "loss", "content": 0.0018246417166665196, "timestamp": "2025-09-04 04:16:23.517327", "step": 3434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:16:23.594959", "step": 3434, "epoch": 3 }, { "type": "loss", "content": 0.03193148225545883, "timestamp": "2025-09-04 04:16:23.609001", "step": 3435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:23.711499", "step": 3435, "epoch": 3 }, { "type": "loss", "content": 0.0072656129486858845, "timestamp": "2025-09-04 04:16:23.731140", "step": 3436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:23.830096", "step": 3436, "epoch": 3 }, { "type": "loss", "content": 0.0032099627424031496, "timestamp": "2025-09-04 04:16:23.850782", "step": 3437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:16:23.945449", "step": 3437, "epoch": 3 }, { "type": "loss", "content": 0.004939934704452753, "timestamp": "2025-09-04 04:16:23.962953", "step": 3438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:24.065224", "step": 3438, "epoch": 3 }, { "type": "loss", "content": 0.003717708634212613, "timestamp": "2025-09-04 04:16:24.084445", "step": 3439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:24.192910", "step": 3439, "epoch": 3 }, { "type": "loss", "content": 0.010961350053548813, "timestamp": "2025-09-04 04:16:24.214147", "step": 3440, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:16:32.619010", "step": 3440, "epoch": 3 }, { "type": "pplx", "content": 276.0165633644801, "timestamp": "2025-09-04 04:16:32.621440", "step": 3440, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3440", "timestamp": "2025-09-04 04:16:33.156281", "step": 3440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:33.245723", "step": 3440, "epoch": 3 }, { "type": "loss", "content": 0.0005490960320457816, "timestamp": "2025-09-04 04:16:33.264393", "step": 3441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:16:33.338981", "step": 3441, "epoch": 3 }, { "type": "loss", "content": 0.0026552164927124977, "timestamp": "2025-09-04 04:16:33.352587", "step": 3442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:16:33.460260", "step": 3442, "epoch": 3 }, { "type": "loss", "content": 0.0013740723952651024, "timestamp": "2025-09-04 04:16:33.480498", "step": 3443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:33.590052", "step": 3443, "epoch": 3 }, { "type": "loss", "content": 0.005307029001414776, "timestamp": "2025-09-04 04:16:33.611410", "step": 3444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:33.702176", "step": 3444, "epoch": 3 }, { "type": "loss", "content": 0.018815629184246063, "timestamp": "2025-09-04 04:16:33.720515", "step": 3445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:33.821133", "step": 3445, "epoch": 3 }, { "type": "loss", "content": 0.0011458718217909336, "timestamp": "2025-09-04 04:16:33.839998", "step": 3446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1408 ], "flops": 28160171015680.0 }, "timestamp": "2025-09-04 04:16:34.047495", "step": 3446, "epoch": 3 }, { "type": "loss", "content": 0.0004305221955291927, "timestamp": "2025-09-04 04:16:34.086809", "step": 3447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:34.188269", "step": 3447, "epoch": 3 }, { "type": "loss", "content": 0.004843763541430235, "timestamp": "2025-09-04 04:16:34.208262", "step": 3448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:16:34.314432", "step": 3448, "epoch": 3 }, { "type": "loss", "content": 0.005847205873578787, "timestamp": "2025-09-04 04:16:34.337045", "step": 3449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:16:34.419987", "step": 3449, "epoch": 3 }, { "type": "loss", "content": 0.025259602814912796, "timestamp": "2025-09-04 04:16:34.435272", "step": 3450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:16:34.518599", "step": 3450, "epoch": 3 }, { "type": "loss", "content": 0.003980662208050489, "timestamp": "2025-09-04 04:16:34.533632", "step": 3451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:16:34.627948", "step": 3451, "epoch": 3 }, { "type": "loss", "content": 0.0030712997540831566, "timestamp": "2025-09-04 04:16:34.646239", "step": 3452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:34.748110", "step": 3452, "epoch": 3 }, { "type": "loss", "content": 0.049204930663108826, "timestamp": "2025-09-04 04:16:34.769245", "step": 3453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:34.860524", "step": 3453, "epoch": 3 }, { "type": "loss", "content": 0.00023195317771751434, "timestamp": "2025-09-04 04:16:34.877272", "step": 3454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:34.976625", "step": 3454, "epoch": 3 }, { "type": "loss", "content": 0.005655570421367884, "timestamp": "2025-09-04 04:16:34.995186", "step": 3455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:16:35.072036", "step": 3455, "epoch": 3 }, { "type": "loss", "content": 0.009977075271308422, "timestamp": "2025-09-04 04:16:35.086890", "step": 3456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:35.177118", "step": 3456, "epoch": 3 }, { "type": "loss", "content": 0.017296381294727325, "timestamp": "2025-09-04 04:16:35.195820", "step": 3457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:35.286844", "step": 3457, "epoch": 3 }, { "type": "loss", "content": 0.008046641014516354, "timestamp": "2025-09-04 04:16:35.303590", "step": 3458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:35.412460", "step": 3458, "epoch": 3 }, { "type": "loss", "content": 0.03136194869875908, "timestamp": "2025-09-04 04:16:35.432984", "step": 3459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:35.536842", "step": 3459, "epoch": 3 }, { "type": "loss", "content": 0.00904142763465643, "timestamp": "2025-09-04 04:16:35.556939", "step": 3460, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:16:43.938279", "step": 3460, "epoch": 3 }, { "type": "pplx", "content": 273.344454429984, "timestamp": "2025-09-04 04:16:43.940762", "step": 3460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:44.042144", "step": 3460, "epoch": 3 }, { "type": "loss", "content": 0.006052535958588123, "timestamp": "2025-09-04 04:16:44.064039", "step": 3461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:44.167096", "step": 3461, "epoch": 3 }, { "type": "loss", "content": 0.0018702381057664752, "timestamp": "2025-09-04 04:16:44.186364", "step": 3462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:44.289805", "step": 3462, "epoch": 3 }, { "type": "loss", "content": 0.006505020894110203, "timestamp": "2025-09-04 04:16:44.309039", "step": 3463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:44.414600", "step": 3463, "epoch": 3 }, { "type": "loss", "content": 0.006749553140252829, "timestamp": "2025-09-04 04:16:44.435284", "step": 3464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:44.527480", "step": 3464, "epoch": 3 }, { "type": "loss", "content": 0.011513489298522472, "timestamp": "2025-09-04 04:16:44.546415", "step": 3465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:44.649578", "step": 3465, "epoch": 3 }, { "type": "loss", "content": 0.0009676741319708526, "timestamp": "2025-09-04 04:16:44.668399", "step": 3466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 04:16:44.799523", "step": 3466, "epoch": 3 }, { "type": "loss", "content": 0.0007584612467326224, "timestamp": "2025-09-04 04:16:44.824173", "step": 3467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:44.925235", "step": 3467, "epoch": 3 }, { "type": "loss", "content": 0.051189690828323364, "timestamp": "2025-09-04 04:16:44.944884", "step": 3468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:45.043356", "step": 3468, "epoch": 3 }, { "type": "loss", "content": 0.004076420795172453, "timestamp": "2025-09-04 04:16:45.063783", "step": 3469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:45.169858", "step": 3469, "epoch": 3 }, { "type": "loss", "content": 0.01913578435778618, "timestamp": "2025-09-04 04:16:45.189959", "step": 3470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:16:45.283963", "step": 3470, "epoch": 3 }, { "type": "loss", "content": 0.0021679247729480267, "timestamp": "2025-09-04 04:16:45.301246", "step": 3471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:16:45.397107", "step": 3471, "epoch": 3 }, { "type": "loss", "content": 0.00030914912349544466, "timestamp": "2025-09-04 04:16:45.413267", "step": 3472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:16:45.495736", "step": 3472, "epoch": 3 }, { "type": "loss", "content": 0.0001749959192238748, "timestamp": "2025-09-04 04:16:45.512249", "step": 3473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:45.616646", "step": 3473, "epoch": 3 }, { "type": "loss", "content": 0.008923078887164593, "timestamp": "2025-09-04 04:16:45.635799", "step": 3474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:45.742204", "step": 3474, "epoch": 3 }, { "type": "loss", "content": 0.01262232568114996, "timestamp": "2025-09-04 04:16:45.762090", "step": 3475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:45.864266", "step": 3475, "epoch": 3 }, { "type": "loss", "content": 0.005357048008590937, "timestamp": "2025-09-04 04:16:45.883695", "step": 3476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:45.990063", "step": 3476, "epoch": 3 }, { "type": "loss", "content": 0.0005004971753805876, "timestamp": "2025-09-04 04:16:46.012610", "step": 3477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:46.112508", "step": 3477, "epoch": 3 }, { "type": "loss", "content": 0.004670882131904364, "timestamp": "2025-09-04 04:16:46.131186", "step": 3478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:46.240850", "step": 3478, "epoch": 3 }, { "type": "loss", "content": 0.001415494829416275, "timestamp": "2025-09-04 04:16:46.261391", "step": 3479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:46.354980", "step": 3479, "epoch": 3 }, { "type": "loss", "content": 0.0019339878344908357, "timestamp": "2025-09-04 04:16:46.372907", "step": 3480, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:16:54.758650", "step": 3480, "epoch": 3 }, { "type": "pplx", "content": 273.04229426249503, "timestamp": "2025-09-04 04:16:54.760818", "step": 3480, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3480", "timestamp": "2025-09-04 04:16:55.106808", "step": 3480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:55.197784", "step": 3480, "epoch": 3 }, { "type": "loss", "content": 0.007648364640772343, "timestamp": "2025-09-04 04:16:55.216546", "step": 3481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:16:55.311636", "step": 3481, "epoch": 3 }, { "type": "loss", "content": 0.003384833922609687, "timestamp": "2025-09-04 04:16:55.329096", "step": 3482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:55.421525", "step": 3482, "epoch": 3 }, { "type": "loss", "content": 0.0017845932161435485, "timestamp": "2025-09-04 04:16:55.438702", "step": 3483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:55.542236", "step": 3483, "epoch": 3 }, { "type": "loss", "content": 0.030296683311462402, "timestamp": "2025-09-04 04:16:55.562301", "step": 3484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:55.659953", "step": 3484, "epoch": 3 }, { "type": "loss", "content": 0.010239890776574612, "timestamp": "2025-09-04 04:16:55.680631", "step": 3485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:16:55.773577", "step": 3485, "epoch": 3 }, { "type": "loss", "content": 0.01184002310037613, "timestamp": "2025-09-04 04:16:55.790695", "step": 3486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:55.896552", "step": 3486, "epoch": 3 }, { "type": "loss", "content": 0.0026469461154192686, "timestamp": "2025-09-04 04:16:55.916558", "step": 3487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:56.020152", "step": 3487, "epoch": 3 }, { "type": "loss", "content": 0.006876929197460413, "timestamp": "2025-09-04 04:16:56.040234", "step": 3488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:16:56.147712", "step": 3488, "epoch": 3 }, { "type": "loss", "content": 0.012749651446938515, "timestamp": "2025-09-04 04:16:56.170254", "step": 3489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:56.273664", "step": 3489, "epoch": 3 }, { "type": "loss", "content": 0.00782372523099184, "timestamp": "2025-09-04 04:16:56.292933", "step": 3490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:16:56.378232", "step": 3490, "epoch": 3 }, { "type": "loss", "content": 0.0010486006503924727, "timestamp": "2025-09-04 04:16:56.393419", "step": 3491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:56.493707", "step": 3491, "epoch": 3 }, { "type": "loss", "content": 0.0012736011995002627, "timestamp": "2025-09-04 04:16:56.513374", "step": 3492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:16:56.611057", "step": 3492, "epoch": 3 }, { "type": "loss", "content": 0.019962089136242867, "timestamp": "2025-09-04 04:16:56.631424", "step": 3493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:16:56.734929", "step": 3493, "epoch": 3 }, { "type": "loss", "content": 0.016266385093331337, "timestamp": "2025-09-04 04:16:56.754022", "step": 3494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:56.856315", "step": 3494, "epoch": 3 }, { "type": "loss", "content": 0.004476721398532391, "timestamp": "2025-09-04 04:16:56.875280", "step": 3495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:16:56.975567", "step": 3495, "epoch": 3 }, { "type": "loss", "content": 0.0004053797747474164, "timestamp": "2025-09-04 04:16:56.995298", "step": 3496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:57.095271", "step": 3496, "epoch": 3 }, { "type": "loss", "content": 0.016851192340254784, "timestamp": "2025-09-04 04:16:57.116407", "step": 3497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:16:57.223991", "step": 3497, "epoch": 3 }, { "type": "loss", "content": 0.002290160395205021, "timestamp": "2025-09-04 04:16:57.244053", "step": 3498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:16:57.334762", "step": 3498, "epoch": 3 }, { "type": "loss", "content": 0.009426870383322239, "timestamp": "2025-09-04 04:16:57.351647", "step": 3499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:16:57.454684", "step": 3499, "epoch": 3 }, { "type": "loss", "content": 0.010544263757765293, "timestamp": "2025-09-04 04:16:57.474898", "step": 3500, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:17:05.851374", "step": 3500, "epoch": 3 }, { "type": "pplx", "content": 278.8655476522292, "timestamp": "2025-09-04 04:17:05.854405", "step": 3500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:05.952478", "step": 3500, "epoch": 3 }, { "type": "loss", "content": 0.006872169207781553, "timestamp": "2025-09-04 04:17:05.973642", "step": 3501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:06.067259", "step": 3501, "epoch": 3 }, { "type": "loss", "content": 0.010031554847955704, "timestamp": "2025-09-04 04:17:06.084553", "step": 3502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:06.174657", "step": 3502, "epoch": 3 }, { "type": "loss", "content": 0.005660749971866608, "timestamp": "2025-09-04 04:17:06.191462", "step": 3503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:17:06.276212", "step": 3503, "epoch": 3 }, { "type": "loss", "content": 0.009915877133607864, "timestamp": "2025-09-04 04:17:06.292393", "step": 3504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:06.383845", "step": 3504, "epoch": 3 }, { "type": "loss", "content": 0.004943343810737133, "timestamp": "2025-09-04 04:17:06.403017", "step": 3505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:06.504669", "step": 3505, "epoch": 3 }, { "type": "loss", "content": 0.0015765562420710921, "timestamp": "2025-09-04 04:17:06.523796", "step": 3506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:06.616657", "step": 3506, "epoch": 3 }, { "type": "loss", "content": 0.0038354985881596804, "timestamp": "2025-09-04 04:17:06.633845", "step": 3507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:06.750740", "step": 3507, "epoch": 3 }, { "type": "loss", "content": 0.0014609359204769135, "timestamp": "2025-09-04 04:17:06.770698", "step": 3508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:06.873650", "step": 3508, "epoch": 3 }, { "type": "loss", "content": 0.002754951361566782, "timestamp": "2025-09-04 04:17:06.895603", "step": 3509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:06.985320", "step": 3509, "epoch": 3 }, { "type": "loss", "content": 0.05803408473730087, "timestamp": "2025-09-04 04:17:07.002134", "step": 3510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:07.104670", "step": 3510, "epoch": 3 }, { "type": "loss", "content": 0.002227720571681857, "timestamp": "2025-09-04 04:17:07.123858", "step": 3511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:07.224100", "step": 3511, "epoch": 3 }, { "type": "loss", "content": 0.00011479722161311656, "timestamp": "2025-09-04 04:17:07.243742", "step": 3512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:07.334422", "step": 3512, "epoch": 3 }, { "type": "loss", "content": 0.0112196309491992, "timestamp": "2025-09-04 04:17:07.353494", "step": 3513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:07.447008", "step": 3513, "epoch": 3 }, { "type": "loss", "content": 0.00041249426431022584, "timestamp": "2025-09-04 04:17:07.464442", "step": 3514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:07.567901", "step": 3514, "epoch": 3 }, { "type": "loss", "content": 0.003908245358616114, "timestamp": "2025-09-04 04:17:07.587156", "step": 3515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:07.678752", "step": 3515, "epoch": 3 }, { "type": "loss", "content": 0.04177214950323105, "timestamp": "2025-09-04 04:17:07.696276", "step": 3516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:07.795192", "step": 3516, "epoch": 3 }, { "type": "loss", "content": 0.060547590255737305, "timestamp": "2025-09-04 04:17:07.815910", "step": 3517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:07.917711", "step": 3517, "epoch": 3 }, { "type": "loss", "content": 0.0007211702759377658, "timestamp": "2025-09-04 04:17:07.936836", "step": 3518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:17:08.071144", "step": 3518, "epoch": 3 }, { "type": "loss", "content": 0.0037805659230798483, "timestamp": "2025-09-04 04:17:08.097057", "step": 3519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:08.190982", "step": 3519, "epoch": 3 }, { "type": "loss", "content": 0.007106819190084934, "timestamp": "2025-09-04 04:17:08.209152", "step": 3520, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:17:16.652692", "step": 3520, "epoch": 3 }, { "type": "pplx", "content": 285.4826197362545, "timestamp": "2025-09-04 04:17:16.656011", "step": 3520, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3520", "timestamp": "2025-09-04 04:17:17.176950", "step": 3520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:17.280941", "step": 3520, "epoch": 3 }, { "type": "loss", "content": 0.00228358106687665, "timestamp": "2025-09-04 04:17:17.302185", "step": 3521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:17:17.387962", "step": 3521, "epoch": 3 }, { "type": "loss", "content": 0.005259730387479067, "timestamp": "2025-09-04 04:17:17.401639", "step": 3522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:17.494562", "step": 3522, "epoch": 3 }, { "type": "loss", "content": 0.0028661582618951797, "timestamp": "2025-09-04 04:17:17.511807", "step": 3523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:17:17.646766", "step": 3523, "epoch": 3 }, { "type": "loss", "content": 0.0053170472383499146, "timestamp": "2025-09-04 04:17:17.673610", "step": 3524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:17.776209", "step": 3524, "epoch": 3 }, { "type": "loss", "content": 0.010155066847801208, "timestamp": "2025-09-04 04:17:17.797350", "step": 3525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:17:17.893899", "step": 3525, "epoch": 3 }, { "type": "loss", "content": 0.011916323564946651, "timestamp": "2025-09-04 04:17:17.911517", "step": 3526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:18.002884", "step": 3526, "epoch": 3 }, { "type": "loss", "content": 0.0006236597546376288, "timestamp": "2025-09-04 04:17:18.019750", "step": 3527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:17:18.095835", "step": 3527, "epoch": 3 }, { "type": "loss", "content": 0.007811339106410742, "timestamp": "2025-09-04 04:17:18.110506", "step": 3528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:17:18.207549", "step": 3528, "epoch": 3 }, { "type": "loss", "content": 0.0019810060039162636, "timestamp": "2025-09-04 04:17:18.227876", "step": 3529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:17:18.339268", "step": 3529, "epoch": 3 }, { "type": "loss", "content": 0.0015669839922338724, "timestamp": "2025-09-04 04:17:18.359806", "step": 3530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:18.457156", "step": 3530, "epoch": 3 }, { "type": "loss", "content": 0.022024275735020638, "timestamp": "2025-09-04 04:17:18.474719", "step": 3531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:17:18.606447", "step": 3531, "epoch": 3 }, { "type": "loss", "content": 0.00185906991828233, "timestamp": "2025-09-04 04:17:18.630444", "step": 3532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:18.730940", "step": 3532, "epoch": 3 }, { "type": "loss", "content": 0.0033666298259049654, "timestamp": "2025-09-04 04:17:18.752102", "step": 3533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 04:17:18.926745", "step": 3533, "epoch": 3 }, { "type": "loss", "content": 0.0011104767909273505, "timestamp": "2025-09-04 04:17:18.959381", "step": 3534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:19.067563", "step": 3534, "epoch": 3 }, { "type": "loss", "content": 0.0017678681761026382, "timestamp": "2025-09-04 04:17:19.087547", "step": 3535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:19.195375", "step": 3535, "epoch": 3 }, { "type": "loss", "content": 0.003609925974160433, "timestamp": "2025-09-04 04:17:19.216154", "step": 3536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:19.304973", "step": 3536, "epoch": 3 }, { "type": "loss", "content": 0.021984629333019257, "timestamp": "2025-09-04 04:17:19.323469", "step": 3537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:19.429598", "step": 3537, "epoch": 3 }, { "type": "loss", "content": 0.00020458322251215577, "timestamp": "2025-09-04 04:17:19.449567", "step": 3538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:17:19.535223", "step": 3538, "epoch": 3 }, { "type": "loss", "content": 0.01300242729485035, "timestamp": "2025-09-04 04:17:19.550684", "step": 3539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:19.667433", "step": 3539, "epoch": 3 }, { "type": "loss", "content": 0.01605132967233658, "timestamp": "2025-09-04 04:17:19.688333", "step": 3540, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:17:28.163839", "step": 3540, "epoch": 3 }, { "type": "pplx", "content": 287.8090877611623, "timestamp": "2025-09-04 04:17:28.165953", "step": 3540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:28.267471", "step": 3540, "epoch": 3 }, { "type": "loss", "content": 0.0015539666637778282, "timestamp": "2025-09-04 04:17:28.289355", "step": 3541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:17:28.493842", "step": 3541, "epoch": 3 }, { "type": "loss", "content": 0.029507221654057503, "timestamp": "2025-09-04 04:17:28.532966", "step": 3542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:17:28.612593", "step": 3542, "epoch": 3 }, { "type": "loss", "content": 0.0015970682725310326, "timestamp": "2025-09-04 04:17:28.626758", "step": 3543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:17:28.744337", "step": 3543, "epoch": 3 }, { "type": "loss", "content": 0.008442920632660389, "timestamp": "2025-09-04 04:17:28.767276", "step": 3544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:17:28.873521", "step": 3544, "epoch": 3 }, { "type": "loss", "content": 0.027983790263533592, "timestamp": "2025-09-04 04:17:28.895818", "step": 3545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:28.987931", "step": 3545, "epoch": 3 }, { "type": "loss", "content": 0.0004234362568240613, "timestamp": "2025-09-04 04:17:29.004675", "step": 3546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:17:29.090019", "step": 3546, "epoch": 3 }, { "type": "loss", "content": 0.0007317407871596515, "timestamp": "2025-09-04 04:17:29.105455", "step": 3547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:29.208608", "step": 3547, "epoch": 3 }, { "type": "loss", "content": 0.00967488158494234, "timestamp": "2025-09-04 04:17:29.228533", "step": 3548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:17:29.321831", "step": 3548, "epoch": 3 }, { "type": "loss", "content": 0.02515988238155842, "timestamp": "2025-09-04 04:17:29.341037", "step": 3549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 04:17:29.514992", "step": 3549, "epoch": 3 }, { "type": "loss", "content": 0.000993523863144219, "timestamp": "2025-09-04 04:17:29.549641", "step": 3550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:29.645946", "step": 3550, "epoch": 3 }, { "type": "loss", "content": 0.000763634976465255, "timestamp": "2025-09-04 04:17:29.663104", "step": 3551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:17:29.747319", "step": 3551, "epoch": 3 }, { "type": "loss", "content": 0.000361975806299597, "timestamp": "2025-09-04 04:17:29.763321", "step": 3552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:17:29.846943", "step": 3552, "epoch": 3 }, { "type": "loss", "content": 0.0030570703092962503, "timestamp": "2025-09-04 04:17:29.864182", "step": 3553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:17:29.941189", "step": 3553, "epoch": 3 }, { "type": "loss", "content": 0.0028307621832937002, "timestamp": "2025-09-04 04:17:29.955338", "step": 3554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:30.060042", "step": 3554, "epoch": 3 }, { "type": "loss", "content": 0.0015227803960442543, "timestamp": "2025-09-04 04:17:30.079268", "step": 3555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:17:30.184667", "step": 3555, "epoch": 3 }, { "type": "loss", "content": 0.013519185595214367, "timestamp": "2025-09-04 04:17:30.202939", "step": 3556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:17:30.308622", "step": 3556, "epoch": 3 }, { "type": "loss", "content": 0.0007381472387351096, "timestamp": "2025-09-04 04:17:30.330915", "step": 3557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:17:30.534526", "step": 3557, "epoch": 3 }, { "type": "loss", "content": 0.0012235705507919192, "timestamp": "2025-09-04 04:17:30.573628", "step": 3558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:30.683910", "step": 3558, "epoch": 3 }, { "type": "loss", "content": 0.007621685042977333, "timestamp": "2025-09-04 04:17:30.703119", "step": 3559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:17:30.808101", "step": 3559, "epoch": 3 }, { "type": "loss", "content": 0.0009364414145238698, "timestamp": "2025-09-04 04:17:30.827407", "step": 3560, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:17:39.214990", "step": 3560, "epoch": 3 }, { "type": "pplx", "content": 289.38046074506144, "timestamp": "2025-09-04 04:17:39.216730", "step": 3560, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3560", "timestamp": "2025-09-04 04:17:39.578194", "step": 3560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:17:39.694895", "step": 3560, "epoch": 3 }, { "type": "loss", "content": 0.09077467024326324, "timestamp": "2025-09-04 04:17:39.718681", "step": 3561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:39.822623", "step": 3561, "epoch": 3 }, { "type": "loss", "content": 0.001359087647870183, "timestamp": "2025-09-04 04:17:39.841874", "step": 3562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:39.944941", "step": 3562, "epoch": 3 }, { "type": "loss", "content": 0.0016831067623570561, "timestamp": "2025-09-04 04:17:39.964001", "step": 3563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:17:40.059536", "step": 3563, "epoch": 3 }, { "type": "loss", "content": 0.04928234592080116, "timestamp": "2025-09-04 04:17:40.077782", "step": 3564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:17:40.170060", "step": 3564, "epoch": 3 }, { "type": "loss", "content": 0.02024707943201065, "timestamp": "2025-09-04 04:17:40.189078", "step": 3565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:40.289261", "step": 3565, "epoch": 3 }, { "type": "loss", "content": 0.02083570696413517, "timestamp": "2025-09-04 04:17:40.308255", "step": 3566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:40.409586", "step": 3566, "epoch": 3 }, { "type": "loss", "content": 0.006988304201513529, "timestamp": "2025-09-04 04:17:40.428713", "step": 3567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:40.532535", "step": 3567, "epoch": 3 }, { "type": "loss", "content": 0.005976350978016853, "timestamp": "2025-09-04 04:17:40.552587", "step": 3568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:17:40.625938", "step": 3568, "epoch": 3 }, { "type": "loss", "content": 0.006792505271732807, "timestamp": "2025-09-04 04:17:40.640617", "step": 3569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:40.742711", "step": 3569, "epoch": 3 }, { "type": "loss", "content": 0.004221724346280098, "timestamp": "2025-09-04 04:17:40.761826", "step": 3570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:40.863583", "step": 3570, "epoch": 3 }, { "type": "loss", "content": 0.0009288810542784631, "timestamp": "2025-09-04 04:17:40.882442", "step": 3571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:17:40.982227", "step": 3571, "epoch": 3 }, { "type": "loss", "content": 0.0005838017095811665, "timestamp": "2025-09-04 04:17:41.001628", "step": 3572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:41.090099", "step": 3572, "epoch": 3 }, { "type": "loss", "content": 0.0032299798913300037, "timestamp": "2025-09-04 04:17:41.108485", "step": 3573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:17:41.186326", "step": 3573, "epoch": 3 }, { "type": "loss", "content": 0.0007045165402814746, "timestamp": "2025-09-04 04:17:41.200322", "step": 3574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1472 ], "flops": 29440178786048.0 }, "timestamp": "2025-09-04 04:17:41.415846", "step": 3574, "epoch": 3 }, { "type": "loss", "content": 0.004880525637418032, "timestamp": "2025-09-04 04:17:41.456770", "step": 3575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:17:41.552296", "step": 3575, "epoch": 3 }, { "type": "loss", "content": 0.01930762641131878, "timestamp": "2025-09-04 04:17:41.570598", "step": 3576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:17:41.646968", "step": 3576, "epoch": 3 }, { "type": "loss", "content": 0.006385389715433121, "timestamp": "2025-09-04 04:17:41.662283", "step": 3577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:41.765485", "step": 3577, "epoch": 3 }, { "type": "loss", "content": 0.004767129663378, "timestamp": "2025-09-04 04:17:41.784700", "step": 3578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 864 ], "flops": 17280104967552.0 }, "timestamp": "2025-09-04 04:17:41.912097", "step": 3578, "epoch": 3 }, { "type": "loss", "content": 0.000834185048006475, "timestamp": "2025-09-04 04:17:41.936484", "step": 3579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:42.040776", "step": 3579, "epoch": 3 }, { "type": "loss", "content": 0.00024679276975803077, "timestamp": "2025-09-04 04:17:42.060779", "step": 3580, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:17:50.483560", "step": 3580, "epoch": 3 }, { "type": "pplx", "content": 284.2367375279463, "timestamp": "2025-09-04 04:17:50.485827", "step": 3580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:17:50.560931", "step": 3580, "epoch": 3 }, { "type": "loss", "content": 0.0006078935693949461, "timestamp": "2025-09-04 04:17:50.576216", "step": 3581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:17:50.681804", "step": 3581, "epoch": 3 }, { "type": "loss", "content": 0.0132825942710042, "timestamp": "2025-09-04 04:17:50.700909", "step": 3582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:50.804294", "step": 3582, "epoch": 3 }, { "type": "loss", "content": 0.012423519045114517, "timestamp": "2025-09-04 04:17:50.823176", "step": 3583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:17:50.903122", "step": 3583, "epoch": 3 }, { "type": "loss", "content": 0.00610441155731678, "timestamp": "2025-09-04 04:17:50.918030", "step": 3584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:51.016416", "step": 3584, "epoch": 3 }, { "type": "loss", "content": 0.03366504982113838, "timestamp": "2025-09-04 04:17:51.037152", "step": 3585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:17:51.145741", "step": 3585, "epoch": 3 }, { "type": "loss", "content": 0.0012228694977238774, "timestamp": "2025-09-04 04:17:51.166017", "step": 3586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:51.257368", "step": 3586, "epoch": 3 }, { "type": "loss", "content": 0.0005438943044282496, "timestamp": "2025-09-04 04:17:51.274200", "step": 3587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:17:51.362295", "step": 3587, "epoch": 3 }, { "type": "loss", "content": 0.007683582603931427, "timestamp": "2025-09-04 04:17:51.378679", "step": 3588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:17:51.469388", "step": 3588, "epoch": 3 }, { "type": "loss", "content": 0.0010738805867731571, "timestamp": "2025-09-04 04:17:51.487793", "step": 3589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:17:51.563545", "step": 3589, "epoch": 3 }, { "type": "loss", "content": 0.0031101806089282036, "timestamp": "2025-09-04 04:17:51.577287", "step": 3590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:51.676508", "step": 3590, "epoch": 3 }, { "type": "loss", "content": 0.09019530564546585, "timestamp": "2025-09-04 04:17:51.693617", "step": 3591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:51.794524", "step": 3591, "epoch": 3 }, { "type": "loss", "content": 0.17219799757003784, "timestamp": "2025-09-04 04:17:51.814168", "step": 3592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:17:51.915283", "step": 3592, "epoch": 3 }, { "type": "loss", "content": 0.005689322482794523, "timestamp": "2025-09-04 04:17:51.936352", "step": 3593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:52.032479", "step": 3593, "epoch": 3 }, { "type": "loss", "content": 0.018983401358127594, "timestamp": "2025-09-04 04:17:52.049611", "step": 3594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:52.150681", "step": 3594, "epoch": 3 }, { "type": "loss", "content": 0.0016854063142091036, "timestamp": "2025-09-04 04:17:52.169545", "step": 3595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:17:52.277625", "step": 3595, "epoch": 3 }, { "type": "loss", "content": 0.0018568774685263634, "timestamp": "2025-09-04 04:17:52.298316", "step": 3596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:52.396728", "step": 3596, "epoch": 3 }, { "type": "loss", "content": 0.0010966422269120812, "timestamp": "2025-09-04 04:17:52.417397", "step": 3597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:17:52.510543", "step": 3597, "epoch": 3 }, { "type": "loss", "content": 0.005582905374467373, "timestamp": "2025-09-04 04:17:52.527659", "step": 3598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:17:52.630415", "step": 3598, "epoch": 3 }, { "type": "loss", "content": 0.006567910313606262, "timestamp": "2025-09-04 04:17:52.649399", "step": 3599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:17:52.760999", "step": 3599, "epoch": 3 }, { "type": "loss", "content": 0.021000593900680542, "timestamp": "2025-09-04 04:17:52.782422", "step": 3600, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:01.223292", "step": 3600, "epoch": 3 }, { "type": "pplx", "content": 273.946033699607, "timestamp": "2025-09-04 04:18:01.225358", "step": 3600, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3600", "timestamp": "2025-09-04 04:18:01.586167", "step": 3600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:18:01.659422", "step": 3600, "epoch": 3 }, { "type": "loss", "content": 0.009026916697621346, "timestamp": "2025-09-04 04:18:01.674374", "step": 3601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:01.776530", "step": 3601, "epoch": 3 }, { "type": "loss", "content": 0.0015143795171752572, "timestamp": "2025-09-04 04:18:01.795323", "step": 3602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:18:01.880426", "step": 3602, "epoch": 3 }, { "type": "loss", "content": 0.012896225787699223, "timestamp": "2025-09-04 04:18:01.895890", "step": 3603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:18:01.989210", "step": 3603, "epoch": 3 }, { "type": "loss", "content": 0.014396066777408123, "timestamp": "2025-09-04 04:18:02.007200", "step": 3604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:02.097931", "step": 3604, "epoch": 3 }, { "type": "loss", "content": 0.0015026642940938473, "timestamp": "2025-09-04 04:18:02.117140", "step": 3605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:02.208664", "step": 3605, "epoch": 3 }, { "type": "loss", "content": 0.0049162269569933414, "timestamp": "2025-09-04 04:18:02.225558", "step": 3606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:02.329504", "step": 3606, "epoch": 3 }, { "type": "loss", "content": 0.0029911664314568043, "timestamp": "2025-09-04 04:18:02.348781", "step": 3607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:18:02.458731", "step": 3607, "epoch": 3 }, { "type": "loss", "content": 0.005600781179964542, "timestamp": "2025-09-04 04:18:02.480005", "step": 3608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:18:02.570782", "step": 3608, "epoch": 3 }, { "type": "loss", "content": 0.0017589996568858624, "timestamp": "2025-09-04 04:18:02.589673", "step": 3609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:02.692776", "step": 3609, "epoch": 3 }, { "type": "loss", "content": 0.0030466015450656414, "timestamp": "2025-09-04 04:18:02.711967", "step": 3610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:02.815915", "step": 3610, "epoch": 3 }, { "type": "loss", "content": 0.000875060330145061, "timestamp": "2025-09-04 04:18:02.835190", "step": 3611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:02.930579", "step": 3611, "epoch": 3 }, { "type": "loss", "content": 0.000565591617487371, "timestamp": "2025-09-04 04:18:02.948740", "step": 3612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:03.048836", "step": 3612, "epoch": 3 }, { "type": "loss", "content": 0.001949289464391768, "timestamp": "2025-09-04 04:18:03.069950", "step": 3613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:18:03.179274", "step": 3613, "epoch": 3 }, { "type": "loss", "content": 0.002045362489297986, "timestamp": "2025-09-04 04:18:03.199578", "step": 3614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:03.303512", "step": 3614, "epoch": 3 }, { "type": "loss", "content": 0.025945372879505157, "timestamp": "2025-09-04 04:18:03.322728", "step": 3615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:18:03.409357", "step": 3615, "epoch": 3 }, { "type": "loss", "content": 0.003748838324099779, "timestamp": "2025-09-04 04:18:03.425852", "step": 3616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:18:03.508941", "step": 3616, "epoch": 3 }, { "type": "loss", "content": 0.031195957213640213, "timestamp": "2025-09-04 04:18:03.525966", "step": 3617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:03.617028", "step": 3617, "epoch": 3 }, { "type": "loss", "content": 0.0037481507752090693, "timestamp": "2025-09-04 04:18:03.633901", "step": 3618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:03.734574", "step": 3618, "epoch": 3 }, { "type": "loss", "content": 0.0022598407231271267, "timestamp": "2025-09-04 04:18:03.753419", "step": 3619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:03.856714", "step": 3619, "epoch": 3 }, { "type": "loss", "content": 0.017032302916049957, "timestamp": "2025-09-04 04:18:03.876386", "step": 3620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:12.287116", "step": 3620, "epoch": 3 }, { "type": "pplx", "content": 259.11186362945296, "timestamp": "2025-09-04 04:18:12.288980", "step": 3620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:18:12.368787", "step": 3620, "epoch": 3 }, { "type": "loss", "content": 0.005365411285310984, "timestamp": "2025-09-04 04:18:12.385332", "step": 3621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:18:12.495586", "step": 3621, "epoch": 3 }, { "type": "loss", "content": 0.03964783623814583, "timestamp": "2025-09-04 04:18:12.515826", "step": 3622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:18:12.651939", "step": 3622, "epoch": 3 }, { "type": "loss", "content": 0.005470898933708668, "timestamp": "2025-09-04 04:18:12.678090", "step": 3623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:12.779972", "step": 3623, "epoch": 3 }, { "type": "loss", "content": 0.0020204551983624697, "timestamp": "2025-09-04 04:18:12.799866", "step": 3624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:18:12.903116", "step": 3624, "epoch": 3 }, { "type": "loss", "content": 0.004941796418279409, "timestamp": "2025-09-04 04:18:12.924916", "step": 3625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:13.028080", "step": 3625, "epoch": 3 }, { "type": "loss", "content": 0.00373165775090456, "timestamp": "2025-09-04 04:18:13.047297", "step": 3626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:13.146084", "step": 3626, "epoch": 3 }, { "type": "loss", "content": 0.00431425916031003, "timestamp": "2025-09-04 04:18:13.164715", "step": 3627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:18:13.247659", "step": 3627, "epoch": 3 }, { "type": "loss", "content": 0.009402657859027386, "timestamp": "2025-09-04 04:18:13.263508", "step": 3628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:18:13.370905", "step": 3628, "epoch": 3 }, { "type": "loss", "content": 0.0007547914865426719, "timestamp": "2025-09-04 04:18:13.393375", "step": 3629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:18:13.487383", "step": 3629, "epoch": 3 }, { "type": "loss", "content": 0.0023410057183355093, "timestamp": "2025-09-04 04:18:13.504478", "step": 3630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:13.605540", "step": 3630, "epoch": 3 }, { "type": "loss", "content": 0.008979837410151958, "timestamp": "2025-09-04 04:18:13.624342", "step": 3631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1168 ], "flops": 23360141876800.0 }, "timestamp": "2025-09-04 04:18:13.800587", "step": 3631, "epoch": 3 }, { "type": "loss", "content": 0.0013831626856699586, "timestamp": "2025-09-04 04:18:13.833973", "step": 3632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:18:13.941465", "step": 3632, "epoch": 3 }, { "type": "loss", "content": 0.000762230425607413, "timestamp": "2025-09-04 04:18:13.964016", "step": 3633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:14.067950", "step": 3633, "epoch": 3 }, { "type": "loss", "content": 0.001655717147514224, "timestamp": "2025-09-04 04:18:14.087177", "step": 3634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:18:14.165208", "step": 3634, "epoch": 3 }, { "type": "loss", "content": 0.00832951907068491, "timestamp": "2025-09-04 04:18:14.179160", "step": 3635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:14.283512", "step": 3635, "epoch": 3 }, { "type": "loss", "content": 0.02003934420645237, "timestamp": "2025-09-04 04:18:14.303559", "step": 3636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:14.404763", "step": 3636, "epoch": 3 }, { "type": "loss", "content": 0.003556882031261921, "timestamp": "2025-09-04 04:18:14.425768", "step": 3637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:18:14.536861", "step": 3637, "epoch": 3 }, { "type": "loss", "content": 0.010267372243106365, "timestamp": "2025-09-04 04:18:14.557550", "step": 3638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:14.659506", "step": 3638, "epoch": 3 }, { "type": "loss", "content": 0.004329850431531668, "timestamp": "2025-09-04 04:18:14.678378", "step": 3639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:14.779574", "step": 3639, "epoch": 3 }, { "type": "loss", "content": 0.0016127810813486576, "timestamp": "2025-09-04 04:18:14.799189", "step": 3640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:23.369155", "step": 3640, "epoch": 3 }, { "type": "pplx", "content": 255.24736725474492, "timestamp": "2025-09-04 04:18:23.373987", "step": 3640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3640", "timestamp": "2025-09-04 04:18:23.755326", "step": 3640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:18:23.832501", "step": 3640, "epoch": 3 }, { "type": "loss", "content": 0.0019840069580823183, "timestamp": "2025-09-04 04:18:23.847882", "step": 3641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:23.950295", "step": 3641, "epoch": 3 }, { "type": "loss", "content": 0.011246290989220142, "timestamp": "2025-09-04 04:18:23.969312", "step": 3642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 880 ], "flops": 17600106910144.0 }, "timestamp": "2025-09-04 04:18:24.100055", "step": 3642, "epoch": 3 }, { "type": "loss", "content": 0.0008128046174533665, "timestamp": "2025-09-04 04:18:24.123694", "step": 3643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:24.217210", "step": 3643, "epoch": 3 }, { "type": "loss", "content": 0.002894132863730192, "timestamp": "2025-09-04 04:18:24.234773", "step": 3644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1008 ], "flops": 20160122450880.0 }, "timestamp": "2025-09-04 04:18:24.379481", "step": 3644, "epoch": 3 }, { "type": "loss", "content": 0.0007267083274200559, "timestamp": "2025-09-04 04:18:24.410597", "step": 3645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:24.514114", "step": 3645, "epoch": 3 }, { "type": "loss", "content": 0.003020522417500615, "timestamp": "2025-09-04 04:18:24.533436", "step": 3646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:18:24.642738", "step": 3646, "epoch": 3 }, { "type": "loss", "content": 0.007278635632246733, "timestamp": "2025-09-04 04:18:24.663068", "step": 3647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:24.762487", "step": 3647, "epoch": 3 }, { "type": "loss", "content": 0.020761605352163315, "timestamp": "2025-09-04 04:18:24.781998", "step": 3648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:24.885520", "step": 3648, "epoch": 3 }, { "type": "loss", "content": 0.0007095049950294197, "timestamp": "2025-09-04 04:18:24.906792", "step": 3649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:25.028566", "step": 3649, "epoch": 3 }, { "type": "loss", "content": 0.002251858590170741, "timestamp": "2025-09-04 04:18:25.047729", "step": 3650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:18:25.126742", "step": 3650, "epoch": 3 }, { "type": "loss", "content": 0.0013295934768393636, "timestamp": "2025-09-04 04:18:25.140980", "step": 3651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:25.231776", "step": 3651, "epoch": 3 }, { "type": "loss", "content": 0.023495763540267944, "timestamp": "2025-09-04 04:18:25.249318", "step": 3652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:18:25.366140", "step": 3652, "epoch": 3 }, { "type": "loss", "content": 0.012384490109980106, "timestamp": "2025-09-04 04:18:25.390466", "step": 3653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:25.484835", "step": 3653, "epoch": 3 }, { "type": "loss", "content": 0.0022683092392981052, "timestamp": "2025-09-04 04:18:25.502380", "step": 3654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:18:25.577592", "step": 3654, "epoch": 3 }, { "type": "loss", "content": 0.0017986752791330218, "timestamp": "2025-09-04 04:18:25.591074", "step": 3655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:25.694822", "step": 3655, "epoch": 3 }, { "type": "loss", "content": 0.007066111546009779, "timestamp": "2025-09-04 04:18:25.714876", "step": 3656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:25.811673", "step": 3656, "epoch": 3 }, { "type": "loss", "content": 0.002684858627617359, "timestamp": "2025-09-04 04:18:25.832055", "step": 3657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 928 ], "flops": 18560112737920.0 }, "timestamp": "2025-09-04 04:18:25.966783", "step": 3657, "epoch": 3 }, { "type": "loss", "content": 0.001188501249998808, "timestamp": "2025-09-04 04:18:25.992867", "step": 3658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:26.095363", "step": 3658, "epoch": 3 }, { "type": "loss", "content": 0.005962354131042957, "timestamp": "2025-09-04 04:18:26.114338", "step": 3659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:18:26.201582", "step": 3659, "epoch": 3 }, { "type": "loss", "content": 0.0010241111740469933, "timestamp": "2025-09-04 04:18:26.218028", "step": 3660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:34.655558", "step": 3660, "epoch": 3 }, { "type": "pplx", "content": 260.65734102692426, "timestamp": "2025-09-04 04:18:34.657505", "step": 3660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:34.756784", "step": 3660, "epoch": 3 }, { "type": "loss", "content": 0.0069060372188687325, "timestamp": "2025-09-04 04:18:34.778088", "step": 3661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:18:34.856779", "step": 3661, "epoch": 3 }, { "type": "loss", "content": 0.0014873266918584704, "timestamp": "2025-09-04 04:18:34.871044", "step": 3662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:18:34.948854", "step": 3662, "epoch": 3 }, { "type": "loss", "content": 0.005059961229562759, "timestamp": "2025-09-04 04:18:34.963083", "step": 3663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:18:35.081870", "step": 3663, "epoch": 3 }, { "type": "loss", "content": 0.00022246531443670392, "timestamp": "2025-09-04 04:18:35.103259", "step": 3664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:18:35.206724", "step": 3664, "epoch": 3 }, { "type": "loss", "content": 0.005162604618817568, "timestamp": "2025-09-04 04:18:35.222293", "step": 3665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:35.326289", "step": 3665, "epoch": 3 }, { "type": "loss", "content": 0.0015102955512702465, "timestamp": "2025-09-04 04:18:35.345704", "step": 3666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:18:35.452901", "step": 3666, "epoch": 3 }, { "type": "loss", "content": 0.0005610152729786932, "timestamp": "2025-09-04 04:18:35.472972", "step": 3667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:35.578242", "step": 3667, "epoch": 3 }, { "type": "loss", "content": 0.002261679619550705, "timestamp": "2025-09-04 04:18:35.598405", "step": 3668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:18:35.706464", "step": 3668, "epoch": 3 }, { "type": "loss", "content": 0.009873145259916782, "timestamp": "2025-09-04 04:18:35.728526", "step": 3669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:35.822179", "step": 3669, "epoch": 3 }, { "type": "loss", "content": 0.0006896741688251495, "timestamp": "2025-09-04 04:18:35.839074", "step": 3670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:35.933182", "step": 3670, "epoch": 3 }, { "type": "loss", "content": 0.0011464201379567385, "timestamp": "2025-09-04 04:18:35.950685", "step": 3671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:36.044556", "step": 3671, "epoch": 3 }, { "type": "loss", "content": 0.001933246268890798, "timestamp": "2025-09-04 04:18:36.062058", "step": 3672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:18:36.153756", "step": 3672, "epoch": 3 }, { "type": "loss", "content": 0.005047465208917856, "timestamp": "2025-09-04 04:18:36.173094", "step": 3673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:18:36.264431", "step": 3673, "epoch": 3 }, { "type": "loss", "content": 0.01326705701649189, "timestamp": "2025-09-04 04:18:36.279731", "step": 3674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:36.390540", "step": 3674, "epoch": 3 }, { "type": "loss", "content": 0.001728372648358345, "timestamp": "2025-09-04 04:18:36.409184", "step": 3675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:36.511047", "step": 3675, "epoch": 3 }, { "type": "loss", "content": 0.00010324228787794709, "timestamp": "2025-09-04 04:18:36.531104", "step": 3676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:36.631106", "step": 3676, "epoch": 3 }, { "type": "loss", "content": 0.0006894250982441008, "timestamp": "2025-09-04 04:18:36.651977", "step": 3677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:18:36.737949", "step": 3677, "epoch": 3 }, { "type": "loss", "content": 0.0178501196205616, "timestamp": "2025-09-04 04:18:36.753638", "step": 3678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:36.847722", "step": 3678, "epoch": 3 }, { "type": "loss", "content": 0.004994639195501804, "timestamp": "2025-09-04 04:18:36.865037", "step": 3679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:36.970830", "step": 3679, "epoch": 3 }, { "type": "loss", "content": 9.363189019495621e-05, "timestamp": "2025-09-04 04:18:36.991002", "step": 3680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:45.564502", "step": 3680, "epoch": 3 }, { "type": "pplx", "content": 269.183933665212, "timestamp": "2025-09-04 04:18:45.567064", "step": 3680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3680", "timestamp": "2025-09-04 04:18:45.962752", "step": 3680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:46.060324", "step": 3680, "epoch": 3 }, { "type": "loss", "content": 0.016621418297290802, "timestamp": "2025-09-04 04:18:46.080876", "step": 3681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:18:46.188302", "step": 3681, "epoch": 3 }, { "type": "loss", "content": 0.010536265559494495, "timestamp": "2025-09-04 04:18:46.208241", "step": 3682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:46.313573", "step": 3682, "epoch": 3 }, { "type": "loss", "content": 0.010933700948953629, "timestamp": "2025-09-04 04:18:46.332960", "step": 3683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1424 ], "flops": 28480172958272.0 }, "timestamp": "2025-09-04 04:18:46.545657", "step": 3683, "epoch": 3 }, { "type": "loss", "content": 0.006731206551194191, "timestamp": "2025-09-04 04:18:46.587241", "step": 3684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:46.686304", "step": 3684, "epoch": 3 }, { "type": "loss", "content": 0.017399858683347702, "timestamp": "2025-09-04 04:18:46.706662", "step": 3685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:46.797807", "step": 3685, "epoch": 3 }, { "type": "loss", "content": 0.0017027389258146286, "timestamp": "2025-09-04 04:18:46.814735", "step": 3686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:46.909873", "step": 3686, "epoch": 3 }, { "type": "loss", "content": 0.0014917801599949598, "timestamp": "2025-09-04 04:18:46.927291", "step": 3687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:47.032604", "step": 3687, "epoch": 3 }, { "type": "loss", "content": 0.007048693019896746, "timestamp": "2025-09-04 04:18:47.052648", "step": 3688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:18:47.134919", "step": 3688, "epoch": 3 }, { "type": "loss", "content": 0.023266077041625977, "timestamp": "2025-09-04 04:18:47.151532", "step": 3689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:18:47.260045", "step": 3689, "epoch": 3 }, { "type": "loss", "content": 0.005440382286906242, "timestamp": "2025-09-04 04:18:47.280442", "step": 3690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:47.392196", "step": 3690, "epoch": 3 }, { "type": "loss", "content": 0.005371682345867157, "timestamp": "2025-09-04 04:18:47.411099", "step": 3691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:18:47.506234", "step": 3691, "epoch": 3 }, { "type": "loss", "content": 0.03604043647646904, "timestamp": "2025-09-04 04:18:47.524627", "step": 3692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:18:47.598228", "step": 3692, "epoch": 3 }, { "type": "loss", "content": 0.016086634248495102, "timestamp": "2025-09-04 04:18:47.613044", "step": 3693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:18:47.712403", "step": 3693, "epoch": 3 }, { "type": "loss", "content": 0.003654019208624959, "timestamp": "2025-09-04 04:18:47.731016", "step": 3694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:18:47.849362", "step": 3694, "epoch": 3 }, { "type": "loss", "content": 0.0030856921803206205, "timestamp": "2025-09-04 04:18:47.871448", "step": 3695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:47.976083", "step": 3695, "epoch": 3 }, { "type": "loss", "content": 0.0017932873452082276, "timestamp": "2025-09-04 04:18:47.996135", "step": 3696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:18:48.139248", "step": 3696, "epoch": 3 }, { "type": "loss", "content": 0.0021279409993439913, "timestamp": "2025-09-04 04:18:48.170357", "step": 3697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:18:48.278683", "step": 3697, "epoch": 3 }, { "type": "loss", "content": 0.0009123628260567784, "timestamp": "2025-09-04 04:18:48.299080", "step": 3698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:18:48.395269", "step": 3698, "epoch": 3 }, { "type": "loss", "content": 0.00036065455060452223, "timestamp": "2025-09-04 04:18:48.412856", "step": 3699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:18:48.507808", "step": 3699, "epoch": 3 }, { "type": "loss", "content": 0.019889622926712036, "timestamp": "2025-09-04 04:18:48.525968", "step": 3700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:18:57.005155", "step": 3700, "epoch": 3 }, { "type": "pplx", "content": 276.00496979251966, "timestamp": "2025-09-04 04:18:57.008172", "step": 3700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:57.106990", "step": 3700, "epoch": 3 }, { "type": "loss", "content": 0.03130375221371651, "timestamp": "2025-09-04 04:18:57.127767", "step": 3701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:57.230808", "step": 3701, "epoch": 3 }, { "type": "loss", "content": 0.054532185196876526, "timestamp": "2025-09-04 04:18:57.249702", "step": 3702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:18:57.325769", "step": 3702, "epoch": 3 }, { "type": "loss", "content": 0.008000990375876427, "timestamp": "2025-09-04 04:18:57.339443", "step": 3703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:18:57.430429", "step": 3703, "epoch": 3 }, { "type": "loss", "content": 0.00035263263271190226, "timestamp": "2025-09-04 04:18:57.448000", "step": 3704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:18:57.554283", "step": 3704, "epoch": 3 }, { "type": "loss", "content": 0.003820637473836541, "timestamp": "2025-09-04 04:18:57.576978", "step": 3705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:18:57.672220", "step": 3705, "epoch": 3 }, { "type": "loss", "content": 0.006798545364290476, "timestamp": "2025-09-04 04:18:57.689846", "step": 3706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:18:57.793776", "step": 3706, "epoch": 3 }, { "type": "loss", "content": 0.0006438225973397493, "timestamp": "2025-09-04 04:18:57.813047", "step": 3707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:18:57.889419", "step": 3707, "epoch": 3 }, { "type": "loss", "content": 0.00427033007144928, "timestamp": "2025-09-04 04:18:57.903748", "step": 3708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:18:57.994336", "step": 3708, "epoch": 3 }, { "type": "loss", "content": 0.0027048927731812, "timestamp": "2025-09-04 04:18:58.013185", "step": 3709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:18:58.121817", "step": 3709, "epoch": 3 }, { "type": "loss", "content": 0.028957033529877663, "timestamp": "2025-09-04 04:18:58.140618", "step": 3710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:58.242805", "step": 3710, "epoch": 3 }, { "type": "loss", "content": 0.0006207419210113585, "timestamp": "2025-09-04 04:18:58.261928", "step": 3711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:18:58.372427", "step": 3711, "epoch": 3 }, { "type": "loss", "content": 0.011274183169007301, "timestamp": "2025-09-04 04:18:58.393827", "step": 3712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:18:58.474801", "step": 3712, "epoch": 3 }, { "type": "loss", "content": 0.001740701962262392, "timestamp": "2025-09-04 04:18:58.491381", "step": 3713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1184 ], "flops": 23680143819392.0 }, "timestamp": "2025-09-04 04:18:58.662835", "step": 3713, "epoch": 3 }, { "type": "loss", "content": 0.008133734576404095, "timestamp": "2025-09-04 04:18:58.697505", "step": 3714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:58.799806", "step": 3714, "epoch": 3 }, { "type": "loss", "content": 0.011398537084460258, "timestamp": "2025-09-04 04:18:58.819083", "step": 3715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:58.922406", "step": 3715, "epoch": 3 }, { "type": "loss", "content": 0.006214221939444542, "timestamp": "2025-09-04 04:18:58.942354", "step": 3716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:18:59.044963", "step": 3716, "epoch": 3 }, { "type": "loss", "content": 0.00310851470567286, "timestamp": "2025-09-04 04:18:59.066118", "step": 3717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:18:59.142839", "step": 3717, "epoch": 3 }, { "type": "loss", "content": 0.003634780179709196, "timestamp": "2025-09-04 04:18:59.156277", "step": 3718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:18:59.252361", "step": 3718, "epoch": 3 }, { "type": "loss", "content": 0.0004398180462885648, "timestamp": "2025-09-04 04:18:59.270038", "step": 3719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:18:59.357108", "step": 3719, "epoch": 3 }, { "type": "loss", "content": 0.0019110249122604728, "timestamp": "2025-09-04 04:18:59.373500", "step": 3720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:19:07.964079", "step": 3720, "epoch": 3 }, { "type": "pplx", "content": 281.6937485671799, "timestamp": "2025-09-04 04:19:07.966081", "step": 3720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3720", "timestamp": "2025-09-04 04:19:08.325947", "step": 3720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:19:08.399210", "step": 3720, "epoch": 3 }, { "type": "loss", "content": 0.012652603909373283, "timestamp": "2025-09-04 04:19:08.414237", "step": 3721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:19:08.508172", "step": 3721, "epoch": 3 }, { "type": "loss", "content": 0.007865807972848415, "timestamp": "2025-09-04 04:19:08.525527", "step": 3722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:08.628679", "step": 3722, "epoch": 3 }, { "type": "loss", "content": 0.03159695491194725, "timestamp": "2025-09-04 04:19:08.647870", "step": 3723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:08.735900", "step": 3723, "epoch": 3 }, { "type": "loss", "content": 0.004598596598953009, "timestamp": "2025-09-04 04:19:08.752357", "step": 3724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1232 ], "flops": 24640149647168.0 }, "timestamp": "2025-09-04 04:19:08.931081", "step": 3724, "epoch": 3 }, { "type": "loss", "content": 0.013268624432384968, "timestamp": "2025-09-04 04:19:08.968763", "step": 3725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:09.054834", "step": 3725, "epoch": 3 }, { "type": "loss", "content": 0.0011762577341869473, "timestamp": "2025-09-04 04:19:09.070399", "step": 3726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:09.173921", "step": 3726, "epoch": 3 }, { "type": "loss", "content": 0.04035170376300812, "timestamp": "2025-09-04 04:19:09.193294", "step": 3727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:09.292993", "step": 3727, "epoch": 3 }, { "type": "loss", "content": 0.000641504826489836, "timestamp": "2025-09-04 04:19:09.312305", "step": 3728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 944 ], "flops": 18880114680512.0 }, "timestamp": "2025-09-04 04:19:09.445418", "step": 3728, "epoch": 3 }, { "type": "loss", "content": 0.00042860963731072843, "timestamp": "2025-09-04 04:19:09.474369", "step": 3729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:09.561153", "step": 3729, "epoch": 3 }, { "type": "loss", "content": 0.01674291118979454, "timestamp": "2025-09-04 04:19:09.576784", "step": 3730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:19:09.687305", "step": 3730, "epoch": 3 }, { "type": "loss", "content": 0.04544749855995178, "timestamp": "2025-09-04 04:19:09.707970", "step": 3731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:19:09.809977", "step": 3731, "epoch": 3 }, { "type": "loss", "content": 0.0015480854781344533, "timestamp": "2025-09-04 04:19:09.830021", "step": 3732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:19:09.918419", "step": 3732, "epoch": 3 }, { "type": "loss", "content": 0.00010000986367231235, "timestamp": "2025-09-04 04:19:09.936774", "step": 3733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:10.037412", "step": 3733, "epoch": 3 }, { "type": "loss", "content": 0.014707312919199467, "timestamp": "2025-09-04 04:19:10.056314", "step": 3734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:19:10.139093", "step": 3734, "epoch": 3 }, { "type": "loss", "content": 0.0023547126911580563, "timestamp": "2025-09-04 04:19:10.153214", "step": 3735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:10.238659", "step": 3735, "epoch": 3 }, { "type": "loss", "content": 0.005836615804582834, "timestamp": "2025-09-04 04:19:10.254980", "step": 3736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:10.353590", "step": 3736, "epoch": 3 }, { "type": "loss", "content": 0.00020389581914059818, "timestamp": "2025-09-04 04:19:10.374400", "step": 3737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:10.478398", "step": 3737, "epoch": 3 }, { "type": "loss", "content": 0.0003900925803463906, "timestamp": "2025-09-04 04:19:10.497684", "step": 3738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:10.596881", "step": 3738, "epoch": 3 }, { "type": "loss", "content": 6.265490083023906e-05, "timestamp": "2025-09-04 04:19:10.615552", "step": 3739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 960 ], "flops": 19200116623104.0 }, "timestamp": "2025-09-04 04:19:10.753104", "step": 3739, "epoch": 3 }, { "type": "loss", "content": 0.04083415865898132, "timestamp": "2025-09-04 04:19:10.780282", "step": 3740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:19:19.338127", "step": 3740, "epoch": 3 }, { "type": "pplx", "content": 285.6040904345629, "timestamp": "2025-09-04 04:19:19.340007", "step": 3740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:19:19.457232", "step": 3740, "epoch": 3 }, { "type": "loss", "content": 0.0010513699380680919, "timestamp": "2025-09-04 04:19:19.482682", "step": 3741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1376 ], "flops": 27520167130496.0 }, "timestamp": "2025-09-04 04:19:19.687690", "step": 3741, "epoch": 3 }, { "type": "loss", "content": 0.0182512030005455, "timestamp": "2025-09-04 04:19:19.726797", "step": 3742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:19.838392", "step": 3742, "epoch": 3 }, { "type": "loss", "content": 0.0008120352867990732, "timestamp": "2025-09-04 04:19:19.859139", "step": 3743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:19:19.958398", "step": 3743, "epoch": 3 }, { "type": "loss", "content": 0.0015680500073358417, "timestamp": "2025-09-04 04:19:19.976612", "step": 3744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:19:20.080286", "step": 3744, "epoch": 3 }, { "type": "loss", "content": 0.007197872269898653, "timestamp": "2025-09-04 04:19:20.099097", "step": 3745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:19:20.227801", "step": 3745, "epoch": 3 }, { "type": "loss", "content": 0.011416764929890633, "timestamp": "2025-09-04 04:19:20.248322", "step": 3746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:20.402738", "step": 3746, "epoch": 3 }, { "type": "loss", "content": 0.004257888998836279, "timestamp": "2025-09-04 04:19:20.423364", "step": 3747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:20.563747", "step": 3747, "epoch": 3 }, { "type": "loss", "content": 0.004272022750228643, "timestamp": "2025-09-04 04:19:20.583768", "step": 3748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:19:20.696830", "step": 3748, "epoch": 3 }, { "type": "loss", "content": 0.0040796962566673756, "timestamp": "2025-09-04 04:19:20.716140", "step": 3749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:20.816046", "step": 3749, "epoch": 3 }, { "type": "loss", "content": 0.0027478632982820272, "timestamp": "2025-09-04 04:19:20.835598", "step": 3750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:19:20.984592", "step": 3750, "epoch": 3 }, { "type": "loss", "content": 0.0048896921798586845, "timestamp": "2025-09-04 04:19:21.003477", "step": 3751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:21.136153", "step": 3751, "epoch": 3 }, { "type": "loss", "content": 0.00022235576761886477, "timestamp": "2025-09-04 04:19:21.152425", "step": 3752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:19:21.253033", "step": 3752, "epoch": 3 }, { "type": "loss", "content": 0.009761957451701164, "timestamp": "2025-09-04 04:19:21.272287", "step": 3753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:21.379524", "step": 3753, "epoch": 3 }, { "type": "loss", "content": 0.006642633117735386, "timestamp": "2025-09-04 04:19:21.398711", "step": 3754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:21.513529", "step": 3754, "epoch": 3 }, { "type": "loss", "content": 0.0035070693120360374, "timestamp": "2025-09-04 04:19:21.534261", "step": 3755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:21.639442", "step": 3755, "epoch": 3 }, { "type": "loss", "content": 0.04340730234980583, "timestamp": "2025-09-04 04:19:21.659626", "step": 3756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:21.754047", "step": 3756, "epoch": 3 }, { "type": "loss", "content": 0.02027786523103714, "timestamp": "2025-09-04 04:19:21.771006", "step": 3757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 832 ], "flops": 16640101082368.0 }, "timestamp": "2025-09-04 04:19:21.906935", "step": 3757, "epoch": 3 }, { "type": "loss", "content": 0.029796713963150978, "timestamp": "2025-09-04 04:19:21.930003", "step": 3758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:22.044773", "step": 3758, "epoch": 3 }, { "type": "loss", "content": 0.003318408038467169, "timestamp": "2025-09-04 04:19:22.065227", "step": 3759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:22.205933", "step": 3759, "epoch": 3 }, { "type": "loss", "content": 0.00017196725821122527, "timestamp": "2025-09-04 04:19:22.228658", "step": 3760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:19:31.072582", "step": 3760, "epoch": 3 }, { "type": "pplx", "content": 286.6894182494486, "timestamp": "2025-09-04 04:19:31.075091", "step": 3760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3760", "timestamp": "2025-09-04 04:19:31.569145", "step": 3760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:19:31.696385", "step": 3760, "epoch": 3 }, { "type": "loss", "content": 0.031767360866069794, "timestamp": "2025-09-04 04:19:31.718702", "step": 3761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:19:31.824881", "step": 3761, "epoch": 3 }, { "type": "loss", "content": 0.008120999671518803, "timestamp": "2025-09-04 04:19:31.843846", "step": 3762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:19:31.923836", "step": 3762, "epoch": 3 }, { "type": "loss", "content": 0.04948921501636505, "timestamp": "2025-09-04 04:19:31.937734", "step": 3763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:32.043823", "step": 3763, "epoch": 3 }, { "type": "loss", "content": 0.0013902034843340516, "timestamp": "2025-09-04 04:19:32.063625", "step": 3764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:32.162247", "step": 3764, "epoch": 3 }, { "type": "loss", "content": 0.0031580275390297174, "timestamp": "2025-09-04 04:19:32.182508", "step": 3765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:19:32.277902", "step": 3765, "epoch": 3 }, { "type": "loss", "content": 0.0007031270652078092, "timestamp": "2025-09-04 04:19:32.295167", "step": 3766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:19:32.446073", "step": 3766, "epoch": 3 }, { "type": "loss", "content": 0.006634030491113663, "timestamp": "2025-09-04 04:19:32.474137", "step": 3767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:19:32.546675", "step": 3767, "epoch": 3 }, { "type": "loss", "content": 0.001953285885974765, "timestamp": "2025-09-04 04:19:32.559867", "step": 3768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:32.663220", "step": 3768, "epoch": 3 }, { "type": "loss", "content": 0.0026485491544008255, "timestamp": "2025-09-04 04:19:32.684306", "step": 3769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:19:32.796110", "step": 3769, "epoch": 3 }, { "type": "loss", "content": 0.0007451950805261731, "timestamp": "2025-09-04 04:19:32.816335", "step": 3770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1120 ], "flops": 22400136049024.0 }, "timestamp": "2025-09-04 04:19:32.980248", "step": 3770, "epoch": 3 }, { "type": "loss", "content": 0.00019487171084620059, "timestamp": "2025-09-04 04:19:33.012027", "step": 3771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:19:33.098017", "step": 3771, "epoch": 3 }, { "type": "loss", "content": 0.0070730033330619335, "timestamp": "2025-09-04 04:19:33.113870", "step": 3772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:33.217274", "step": 3772, "epoch": 3 }, { "type": "loss", "content": 0.0077186161652207375, "timestamp": "2025-09-04 04:19:33.238178", "step": 3773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 416 ], "flops": 8320050574976.0 }, "timestamp": "2025-09-04 04:19:33.310724", "step": 3773, "epoch": 3 }, { "type": "loss", "content": 0.003549723420292139, "timestamp": "2025-09-04 04:19:33.323108", "step": 3774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:33.436045", "step": 3774, "epoch": 3 }, { "type": "loss", "content": 0.026352064684033394, "timestamp": "2025-09-04 04:19:33.456603", "step": 3775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:33.544446", "step": 3775, "epoch": 3 }, { "type": "loss", "content": 0.007274407893419266, "timestamp": "2025-09-04 04:19:33.560495", "step": 3776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:19:33.663022", "step": 3776, "epoch": 3 }, { "type": "loss", "content": 0.005042992066591978, "timestamp": "2025-09-04 04:19:33.683851", "step": 3777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:33.785311", "step": 3777, "epoch": 3 }, { "type": "loss", "content": 0.00039034750079736114, "timestamp": "2025-09-04 04:19:33.803786", "step": 3778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:19:33.883644", "step": 3778, "epoch": 3 }, { "type": "loss", "content": 0.00030172173865139484, "timestamp": "2025-09-04 04:19:33.897563", "step": 3779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:19:34.007601", "step": 3779, "epoch": 3 }, { "type": "loss", "content": 0.017556700855493546, "timestamp": "2025-09-04 04:19:34.028541", "step": 3780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:19:42.961188", "step": 3780, "epoch": 3 }, { "type": "pplx", "content": 286.5459421244075, "timestamp": "2025-09-04 04:19:42.963267", "step": 3780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:43.045692", "step": 3780, "epoch": 3 }, { "type": "loss", "content": 0.012321644462645054, "timestamp": "2025-09-04 04:19:43.062957", "step": 3781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:19:43.157605", "step": 3781, "epoch": 3 }, { "type": "loss", "content": 0.002354747848585248, "timestamp": "2025-09-04 04:19:43.175092", "step": 3782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:19:43.282378", "step": 3782, "epoch": 3 }, { "type": "loss", "content": 0.026513388380408287, "timestamp": "2025-09-04 04:19:43.302750", "step": 3783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:19:43.378659", "step": 3783, "epoch": 3 }, { "type": "loss", "content": 0.003994217608124018, "timestamp": "2025-09-04 04:19:43.393283", "step": 3784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:19:43.496515", "step": 3784, "epoch": 3 }, { "type": "loss", "content": 0.001885236008092761, "timestamp": "2025-09-04 04:19:43.518555", "step": 3785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:43.630362", "step": 3785, "epoch": 3 }, { "type": "loss", "content": 0.011333000846207142, "timestamp": "2025-09-04 04:19:43.651043", "step": 3786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:19:43.744476", "step": 3786, "epoch": 3 }, { "type": "loss", "content": 0.0012821113923564553, "timestamp": "2025-09-04 04:19:43.761698", "step": 3787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:43.863342", "step": 3787, "epoch": 3 }, { "type": "loss", "content": 0.0025108223780989647, "timestamp": "2025-09-04 04:19:43.882925", "step": 3788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1088 ], "flops": 21760132163840.0 }, "timestamp": "2025-09-04 04:19:44.035332", "step": 3788, "epoch": 3 }, { "type": "loss", "content": 0.016976939514279366, "timestamp": "2025-09-04 04:19:44.069075", "step": 3789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:44.176089", "step": 3789, "epoch": 3 }, { "type": "loss", "content": 0.007931654341518879, "timestamp": "2025-09-04 04:19:44.195309", "step": 3790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:19:44.291098", "step": 3790, "epoch": 3 }, { "type": "loss", "content": 0.005856201983988285, "timestamp": "2025-09-04 04:19:44.308591", "step": 3791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:19:44.417595", "step": 3791, "epoch": 3 }, { "type": "loss", "content": 0.0056821079924702644, "timestamp": "2025-09-04 04:19:44.438676", "step": 3792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:19:44.523040", "step": 3792, "epoch": 3 }, { "type": "loss", "content": 0.00843026302754879, "timestamp": "2025-09-04 04:19:44.540136", "step": 3793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:44.633107", "step": 3793, "epoch": 3 }, { "type": "loss", "content": 0.009367075748741627, "timestamp": "2025-09-04 04:19:44.648653", "step": 3794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:44.749854", "step": 3794, "epoch": 3 }, { "type": "loss", "content": 0.01958874613046646, "timestamp": "2025-09-04 04:19:44.768794", "step": 3795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:19:44.863243", "step": 3795, "epoch": 3 }, { "type": "loss", "content": 0.003997324500232935, "timestamp": "2025-09-04 04:19:44.881646", "step": 3796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:44.986383", "step": 3796, "epoch": 3 }, { "type": "loss", "content": 0.00712942611426115, "timestamp": "2025-09-04 04:19:45.007700", "step": 3797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 800 ], "flops": 16000097197184.0 }, "timestamp": "2025-09-04 04:19:45.127722", "step": 3797, "epoch": 3 }, { "type": "loss", "content": 0.004085378255695105, "timestamp": "2025-09-04 04:19:45.149603", "step": 3798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:19:45.247231", "step": 3798, "epoch": 3 }, { "type": "loss", "content": 0.0033143635373562574, "timestamp": "2025-09-04 04:19:45.264092", "step": 3799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:19:45.363499", "step": 3799, "epoch": 3 }, { "type": "loss", "content": 0.01740036904811859, "timestamp": "2025-09-04 04:19:45.381789", "step": 3800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:19:53.897411", "step": 3800, "epoch": 3 }, { "type": "pplx", "content": 287.8095566615489, "timestamp": "2025-09-04 04:19:53.899499", "step": 3800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3800", "timestamp": "2025-09-04 04:19:54.373165", "step": 3800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 384 ], "flops": 7680046689792.0 }, "timestamp": "2025-09-04 04:19:54.434055", "step": 3800, "epoch": 3 }, { "type": "loss", "content": 0.004595032427459955, "timestamp": "2025-09-04 04:19:54.446095", "step": 3801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:19:54.529226", "step": 3801, "epoch": 3 }, { "type": "loss", "content": 0.005345925688743591, "timestamp": "2025-09-04 04:19:54.544393", "step": 3802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 896 ], "flops": 17920108852736.0 }, "timestamp": "2025-09-04 04:19:54.673012", "step": 3802, "epoch": 3 }, { "type": "loss", "content": 0.0032179898116737604, "timestamp": "2025-09-04 04:19:54.697760", "step": 3803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:54.784993", "step": 3803, "epoch": 3 }, { "type": "loss", "content": 0.011905638501048088, "timestamp": "2025-09-04 04:19:54.801510", "step": 3804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:54.909779", "step": 3804, "epoch": 3 }, { "type": "loss", "content": 0.00038323350599966943, "timestamp": "2025-09-04 04:19:54.932539", "step": 3805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:19:55.042815", "step": 3805, "epoch": 3 }, { "type": "loss", "content": 0.010173635557293892, "timestamp": "2025-09-04 04:19:55.063495", "step": 3806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:55.167220", "step": 3806, "epoch": 3 }, { "type": "loss", "content": 0.010474266484379768, "timestamp": "2025-09-04 04:19:55.186635", "step": 3807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:55.289309", "step": 3807, "epoch": 3 }, { "type": "loss", "content": 0.006612957455217838, "timestamp": "2025-09-04 04:19:55.309016", "step": 3808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:19:55.385129", "step": 3808, "epoch": 3 }, { "type": "loss", "content": 0.005502256099134684, "timestamp": "2025-09-04 04:19:55.400714", "step": 3809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:19:55.519741", "step": 3809, "epoch": 3 }, { "type": "loss", "content": 0.0005113132647238672, "timestamp": "2025-09-04 04:19:55.540420", "step": 3810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:19:55.643172", "step": 3810, "epoch": 3 }, { "type": "loss", "content": 0.004534454550594091, "timestamp": "2025-09-04 04:19:55.662408", "step": 3811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:55.766556", "step": 3811, "epoch": 3 }, { "type": "loss", "content": 0.0004610498435795307, "timestamp": "2025-09-04 04:19:55.786680", "step": 3812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:19:55.874100", "step": 3812, "epoch": 3 }, { "type": "loss", "content": 0.01628238335251808, "timestamp": "2025-09-04 04:19:55.892557", "step": 3813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:55.996550", "step": 3813, "epoch": 3 }, { "type": "loss", "content": 0.0012509598163887858, "timestamp": "2025-09-04 04:19:56.015759", "step": 3814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:56.116208", "step": 3814, "epoch": 3 }, { "type": "loss", "content": 0.002474145032465458, "timestamp": "2025-09-04 04:19:56.135188", "step": 3815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:19:56.239734", "step": 3815, "epoch": 3 }, { "type": "loss", "content": 0.0014434803742915392, "timestamp": "2025-09-04 04:19:56.259899", "step": 3816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 816 ], "flops": 16320099139776.0 }, "timestamp": "2025-09-04 04:19:56.379011", "step": 3816, "epoch": 3 }, { "type": "loss", "content": 0.0014426189009100199, "timestamp": "2025-09-04 04:19:56.404527", "step": 3817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 576 ], "flops": 11520070000896.0 }, "timestamp": "2025-09-04 04:19:56.491449", "step": 3817, "epoch": 3 }, { "type": "loss", "content": 0.005996841937303543, "timestamp": "2025-09-04 04:19:56.507188", "step": 3818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:19:56.606655", "step": 3818, "epoch": 3 }, { "type": "loss", "content": 0.0009320307872258127, "timestamp": "2025-09-04 04:19:56.625375", "step": 3819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:19:56.737655", "step": 3819, "epoch": 3 }, { "type": "loss", "content": 0.028978558257222176, "timestamp": "2025-09-04 04:19:56.757294", "step": 3820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:05.260868", "step": 3820, "epoch": 3 }, { "type": "pplx", "content": 290.2094912937652, "timestamp": "2025-09-04 04:20:05.263045", "step": 3820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:05.362272", "step": 3820, "epoch": 3 }, { "type": "loss", "content": 0.0015448291087523103, "timestamp": "2025-09-04 04:20:05.383584", "step": 3821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:20:05.479527", "step": 3821, "epoch": 3 }, { "type": "loss", "content": 0.013662063516676426, "timestamp": "2025-09-04 04:20:05.497092", "step": 3822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:05.601600", "step": 3822, "epoch": 3 }, { "type": "loss", "content": 0.003193710930645466, "timestamp": "2025-09-04 04:20:05.620767", "step": 3823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:20:05.722416", "step": 3823, "epoch": 3 }, { "type": "loss", "content": 0.026605522260069847, "timestamp": "2025-09-04 04:20:05.741885", "step": 3824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:20:05.840824", "step": 3824, "epoch": 3 }, { "type": "loss", "content": 0.0024066600017249584, "timestamp": "2025-09-04 04:20:05.861635", "step": 3825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:05.965300", "step": 3825, "epoch": 3 }, { "type": "loss", "content": 0.0002010101597988978, "timestamp": "2025-09-04 04:20:05.984532", "step": 3826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:20:06.093413", "step": 3826, "epoch": 3 }, { "type": "loss", "content": 0.01071922481060028, "timestamp": "2025-09-04 04:20:06.113643", "step": 3827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:20:06.231680", "step": 3827, "epoch": 3 }, { "type": "loss", "content": 0.00014394304889719933, "timestamp": "2025-09-04 04:20:06.254710", "step": 3828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:20:06.346407", "step": 3828, "epoch": 3 }, { "type": "loss", "content": 0.001017340342514217, "timestamp": "2025-09-04 04:20:06.365595", "step": 3829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 512 ], "flops": 10240062230528.0 }, "timestamp": "2025-09-04 04:20:06.443920", "step": 3829, "epoch": 3 }, { "type": "loss", "content": 0.00646333210170269, "timestamp": "2025-09-04 04:20:06.458192", "step": 3830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:20:06.564966", "step": 3830, "epoch": 3 }, { "type": "loss", "content": 0.002317563397809863, "timestamp": "2025-09-04 04:20:06.585040", "step": 3831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:06.688740", "step": 3831, "epoch": 3 }, { "type": "loss", "content": 0.025038516148924828, "timestamp": "2025-09-04 04:20:06.708726", "step": 3832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:20:06.790921", "step": 3832, "epoch": 3 }, { "type": "loss", "content": 0.0008068301249295473, "timestamp": "2025-09-04 04:20:06.807722", "step": 3833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 784 ], "flops": 15680095254592.0 }, "timestamp": "2025-09-04 04:20:06.924763", "step": 3833, "epoch": 3 }, { "type": "loss", "content": 0.007588067092001438, "timestamp": "2025-09-04 04:20:06.946951", "step": 3834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:20:07.023668", "step": 3834, "epoch": 3 }, { "type": "loss", "content": 0.0007368748192675412, "timestamp": "2025-09-04 04:20:07.037543", "step": 3835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:20:07.144531", "step": 3835, "epoch": 3 }, { "type": "loss", "content": 0.00043095918954350054, "timestamp": "2025-09-04 04:20:07.165248", "step": 3836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:20:07.273966", "step": 3836, "epoch": 3 }, { "type": "loss", "content": 0.004333295859396458, "timestamp": "2025-09-04 04:20:07.296790", "step": 3837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:07.394901", "step": 3837, "epoch": 3 }, { "type": "loss", "content": 0.0017703445628285408, "timestamp": "2025-09-04 04:20:07.412129", "step": 3838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:20:07.495915", "step": 3838, "epoch": 3 }, { "type": "loss", "content": 0.03642124682664871, "timestamp": "2025-09-04 04:20:07.511279", "step": 3839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:07.604411", "step": 3839, "epoch": 3 }, { "type": "loss", "content": 0.015598982572555542, "timestamp": "2025-09-04 04:20:07.622411", "step": 3840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:16.109142", "step": 3840, "epoch": 3 }, { "type": "pplx", "content": 293.44611506915754, "timestamp": "2025-09-04 04:20:16.111391", "step": 3840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3840", "timestamp": "2025-09-04 04:20:16.624255", "step": 3840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:20:16.730224", "step": 3840, "epoch": 3 }, { "type": "loss", "content": 0.03652084618806839, "timestamp": "2025-09-04 04:20:16.752751", "step": 3841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:20:16.863261", "step": 3841, "epoch": 3 }, { "type": "loss", "content": 0.0017639618599787354, "timestamp": "2025-09-04 04:20:16.883817", "step": 3842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:20:16.978707", "step": 3842, "epoch": 3 }, { "type": "loss", "content": 0.011736215092241764, "timestamp": "2025-09-04 04:20:16.996332", "step": 3843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 432 ], "flops": 8640052517568.0 }, "timestamp": "2025-09-04 04:20:17.068336", "step": 3843, "epoch": 3 }, { "type": "loss", "content": 0.0006759539246559143, "timestamp": "2025-09-04 04:20:17.081947", "step": 3844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:20:17.165333", "step": 3844, "epoch": 3 }, { "type": "loss", "content": 0.001056056353263557, "timestamp": "2025-09-04 04:20:17.181929", "step": 3845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:20:17.282369", "step": 3845, "epoch": 3 }, { "type": "loss", "content": 0.00557122053578496, "timestamp": "2025-09-04 04:20:17.301087", "step": 3846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:20:17.402195", "step": 3846, "epoch": 3 }, { "type": "loss", "content": 0.010494154877960682, "timestamp": "2025-09-04 04:20:17.421155", "step": 3847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:17.525683", "step": 3847, "epoch": 3 }, { "type": "loss", "content": 0.0003855594841297716, "timestamp": "2025-09-04 04:20:17.545741", "step": 3848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:20:17.643547", "step": 3848, "epoch": 3 }, { "type": "loss", "content": 0.0037584335077553988, "timestamp": "2025-09-04 04:20:17.664083", "step": 3849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:20:17.772097", "step": 3849, "epoch": 3 }, { "type": "loss", "content": 0.008188803680241108, "timestamp": "2025-09-04 04:20:17.792209", "step": 3850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:17.898084", "step": 3850, "epoch": 3 }, { "type": "loss", "content": 0.0006697289645671844, "timestamp": "2025-09-04 04:20:17.917502", "step": 3851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:20:17.995980", "step": 3851, "epoch": 3 }, { "type": "loss", "content": 0.025056758895516396, "timestamp": "2025-09-04 04:20:18.010432", "step": 3852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:20:18.106741", "step": 3852, "epoch": 3 }, { "type": "loss", "content": 0.005154153797775507, "timestamp": "2025-09-04 04:20:18.125833", "step": 3853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 720 ], "flops": 14400087484224.0 }, "timestamp": "2025-09-04 04:20:18.232402", "step": 3853, "epoch": 3 }, { "type": "loss", "content": 0.009047658182680607, "timestamp": "2025-09-04 04:20:18.252461", "step": 3854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:20:18.339002", "step": 3854, "epoch": 3 }, { "type": "loss", "content": 0.010006698779761791, "timestamp": "2025-09-04 04:20:18.354353", "step": 3855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:18.458829", "step": 3855, "epoch": 3 }, { "type": "loss", "content": 0.00595315545797348, "timestamp": "2025-09-04 04:20:18.476860", "step": 3856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 592 ], "flops": 11840071943488.0 }, "timestamp": "2025-09-04 04:20:18.566251", "step": 3856, "epoch": 3 }, { "type": "loss", "content": 0.0007929496932774782, "timestamp": "2025-09-04 04:20:18.584632", "step": 3857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:20:18.661728", "step": 3857, "epoch": 3 }, { "type": "loss", "content": 0.0036627210211008787, "timestamp": "2025-09-04 04:20:18.675620", "step": 3858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:18.778203", "step": 3858, "epoch": 3 }, { "type": "loss", "content": 0.0008201138116419315, "timestamp": "2025-09-04 04:20:18.797517", "step": 3859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:18.900813", "step": 3859, "epoch": 3 }, { "type": "loss", "content": 0.0017376311589032412, "timestamp": "2025-09-04 04:20:18.920853", "step": 3860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:27.435062", "step": 3860, "epoch": 3 }, { "type": "pplx", "content": 294.3503277315976, "timestamp": "2025-09-04 04:20:27.437138", "step": 3860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:20:27.517188", "step": 3860, "epoch": 3 }, { "type": "loss", "content": 0.011900283396244049, "timestamp": "2025-09-04 04:20:27.533953", "step": 3861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:20:27.642933", "step": 3861, "epoch": 3 }, { "type": "loss", "content": 0.015471918508410454, "timestamp": "2025-09-04 04:20:27.663390", "step": 3862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:27.766648", "step": 3862, "epoch": 3 }, { "type": "loss", "content": 0.001793242641724646, "timestamp": "2025-09-04 04:20:27.785811", "step": 3863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:27.879737", "step": 3863, "epoch": 3 }, { "type": "loss", "content": 0.00023462541867047548, "timestamp": "2025-09-04 04:20:27.897669", "step": 3864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:20:28.002948", "step": 3864, "epoch": 3 }, { "type": "loss", "content": 0.005305502098053694, "timestamp": "2025-09-04 04:20:28.025333", "step": 3865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:20:28.145124", "step": 3865, "epoch": 3 }, { "type": "loss", "content": 0.03417619690299034, "timestamp": "2025-09-04 04:20:28.165784", "step": 3866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 1024 ], "flops": 20480124393472.0 }, "timestamp": "2025-09-04 04:20:28.313306", "step": 3866, "epoch": 3 }, { "type": "loss", "content": 0.007200147025287151, "timestamp": "2025-09-04 04:20:28.341721", "step": 3867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:20:28.443445", "step": 3867, "epoch": 3 }, { "type": "loss", "content": 0.04463130235671997, "timestamp": "2025-09-04 04:20:28.470677", "step": 3868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:28.618239", "step": 3868, "epoch": 3 }, { "type": "loss", "content": 0.00033794058253988624, "timestamp": "2025-09-04 04:20:28.638786", "step": 3869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 544 ], "flops": 10880066115712.0 }, "timestamp": "2025-09-04 04:20:28.741173", "step": 3869, "epoch": 3 }, { "type": "loss", "content": 0.09599971026182175, "timestamp": "2025-09-04 04:20:28.756389", "step": 3870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:28.860971", "step": 3870, "epoch": 3 }, { "type": "loss", "content": 0.00989855919033289, "timestamp": "2025-09-04 04:20:28.880352", "step": 3871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 480 ], "flops": 9600058345344.0 }, "timestamp": "2025-09-04 04:20:28.959284", "step": 3871, "epoch": 3 }, { "type": "loss", "content": 0.020102083683013916, "timestamp": "2025-09-04 04:20:28.973935", "step": 3872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:20:29.058460", "step": 3872, "epoch": 3 }, { "type": "loss", "content": 0.010750843212008476, "timestamp": "2025-09-04 04:20:29.075537", "step": 3873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 736 ], "flops": 14720089426816.0 }, "timestamp": "2025-09-04 04:20:29.187735", "step": 3873, "epoch": 3 }, { "type": "loss", "content": 0.024111559614539146, "timestamp": "2025-09-04 04:20:29.208011", "step": 3874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:20:29.322373", "step": 3874, "epoch": 3 }, { "type": "loss", "content": 0.0005178121500648558, "timestamp": "2025-09-04 04:20:29.342893", "step": 3875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:29.439124", "step": 3875, "epoch": 3 }, { "type": "loss", "content": 0.008165943436324596, "timestamp": "2025-09-04 04:20:29.457030", "step": 3876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:20:29.554137", "step": 3876, "epoch": 3 }, { "type": "loss", "content": 0.0027514963876456022, "timestamp": "2025-09-04 04:20:29.573166", "step": 3877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:29.679149", "step": 3877, "epoch": 3 }, { "type": "loss", "content": 0.007004078943282366, "timestamp": "2025-09-04 04:20:29.698344", "step": 3878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 464 ], "flops": 9280056402752.0 }, "timestamp": "2025-09-04 04:20:29.781681", "step": 3878, "epoch": 3 }, { "type": "loss", "content": 0.0011216630227863789, "timestamp": "2025-09-04 04:20:29.794983", "step": 3879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:20:29.883567", "step": 3879, "epoch": 3 }, { "type": "loss", "content": 0.008931388147175312, "timestamp": "2025-09-04 04:20:29.899790", "step": 3880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:38.519446", "step": 3880, "epoch": 3 }, { "type": "pplx", "content": 287.92952489903456, "timestamp": "2025-09-04 04:20:38.522894", "step": 3880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3880", "timestamp": "2025-09-04 04:20:39.031240", "step": 3880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 688 ], "flops": 13760083599040.0 }, "timestamp": "2025-09-04 04:20:39.130503", "step": 3880, "epoch": 3 }, { "type": "loss", "content": 0.0010395454009994864, "timestamp": "2025-09-04 04:20:39.151468", "step": 3881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 560 ], "flops": 11200068058304.0 }, "timestamp": "2025-09-04 04:20:39.237424", "step": 3881, "epoch": 3 }, { "type": "loss", "content": 0.004671165719628334, "timestamp": "2025-09-04 04:20:39.252861", "step": 3882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:20:39.348868", "step": 3882, "epoch": 3 }, { "type": "loss", "content": 0.004468918778002262, "timestamp": "2025-09-04 04:20:39.366431", "step": 3883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:39.461021", "step": 3883, "epoch": 3 }, { "type": "loss", "content": 0.00038716281414963305, "timestamp": "2025-09-04 04:20:39.478989", "step": 3884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 624 ], "flops": 12480075828672.0 }, "timestamp": "2025-09-04 04:20:39.580317", "step": 3884, "epoch": 3 }, { "type": "loss", "content": 0.0007503409287892282, "timestamp": "2025-09-04 04:20:39.599412", "step": 3885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:20:39.719678", "step": 3885, "epoch": 3 }, { "type": "loss", "content": 0.009064269252121449, "timestamp": "2025-09-04 04:20:39.738284", "step": 3886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 656 ], "flops": 13120079713856.0 }, "timestamp": "2025-09-04 04:20:39.837794", "step": 3886, "epoch": 3 }, { "type": "loss", "content": 0.00859552901238203, "timestamp": "2025-09-04 04:20:39.856340", "step": 3887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 640 ], "flops": 12800077771264.0 }, "timestamp": "2025-09-04 04:20:39.952242", "step": 3887, "epoch": 3 }, { "type": "loss", "content": 0.007069156970828772, "timestamp": "2025-09-04 04:20:39.970460", "step": 3888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:20:40.080145", "step": 3888, "epoch": 3 }, { "type": "loss", "content": 0.003085685195401311, "timestamp": "2025-09-04 04:20:40.102925", "step": 3889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 528 ], "flops": 10560064173120.0 }, "timestamp": "2025-09-04 04:20:40.188536", "step": 3889, "epoch": 3 }, { "type": "loss", "content": 0.013397028669714928, "timestamp": "2025-09-04 04:20:40.203557", "step": 3890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 704 ], "flops": 14080085541632.0 }, "timestamp": "2025-09-04 04:20:40.308276", "step": 3890, "epoch": 3 }, { "type": "loss", "content": 0.0029367466922849417, "timestamp": "2025-09-04 04:20:40.327712", "step": 3891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:20:40.430752", "step": 3891, "epoch": 3 }, { "type": "loss", "content": 0.0016781740123406053, "timestamp": "2025-09-04 04:20:40.450543", "step": 3892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:40.545165", "step": 3892, "epoch": 3 }, { "type": "loss", "content": 0.0011187720810994506, "timestamp": "2025-09-04 04:20:40.563906", "step": 3893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 768 ], "flops": 15360093312000.0 }, "timestamp": "2025-09-04 04:20:40.673482", "step": 3893, "epoch": 3 }, { "type": "loss", "content": 0.007320099975913763, "timestamp": "2025-09-04 04:20:40.694144", "step": 3894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 672 ], "flops": 13440081656448.0 }, "timestamp": "2025-09-04 04:20:40.805033", "step": 3894, "epoch": 3 }, { "type": "loss", "content": 0.0027296245098114014, "timestamp": "2025-09-04 04:20:40.824078", "step": 3895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 912 ], "flops": 18240110795328.0 }, "timestamp": "2025-09-04 04:20:40.958781", "step": 3895, "epoch": 3 }, { "type": "loss", "content": 0.0014986982569098473, "timestamp": "2025-09-04 04:20:40.984293", "step": 3896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:41.076680", "step": 3896, "epoch": 3 }, { "type": "loss", "content": 0.011014275252819061, "timestamp": "2025-09-04 04:20:41.095522", "step": 3897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 496 ], "flops": 9920060287936.0 }, "timestamp": "2025-09-04 04:20:41.174621", "step": 3897, "epoch": 3 }, { "type": "loss", "content": 0.0021178617607802153, "timestamp": "2025-09-04 04:20:41.188639", "step": 3898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 608 ], "flops": 12160073886080.0 }, "timestamp": "2025-09-04 04:20:41.283021", "step": 3898, "epoch": 3 }, { "type": "loss", "content": 0.03507016599178314, "timestamp": "2025-09-04 04:20:41.300244", "step": 3899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 752 ], "flops": 15040091369408.0 }, "timestamp": "2025-09-04 04:20:41.413224", "step": 3899, "epoch": 3 }, { "type": "loss", "content": 0.0063927737064659595, "timestamp": "2025-09-04 04:20:41.434668", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:49.925006", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 280.9776699653499, "timestamp": "2025-09-04 04:20:49.927474", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1200 ], "batch_size": 8, "flops": 23953716633984 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 1424 ], "batch_size": 8, "flops": 28425077059712 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 528 ], "batch_size": 8, "flops": 10539635356800 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 544 ], "batch_size": 8, "flops": 10859018244352 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 880 ], "batch_size": 8, "flops": 17566058882944 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 784 ], "batch_size": 8, "flops": 15649761557632 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 736 ], "batch_size": 8, "flops": 14691612894976 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 592 ], "batch_size": 8, "flops": 11817166907008 }, { "type": "perplexity", "in_batch_dim": [ 8, 576 ], "batch_size": 8, "flops": 11497784019456 }, { "type": "perplexity", "in_batch_dim": [ 8, 608 ], "batch_size": 8, "flops": 12136549794560 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 704 ], "batch_size": 8, "flops": 14052847119872 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 1344 ], "batch_size": 8, "flops": 26828162621952 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 624 ], "batch_size": 8, "flops": 12455932682112 }, { "type": "perplexity", "in_batch_dim": [ 8, 688 ], "batch_size": 8, "flops": 13733464232320 }, { "type": "perplexity", "in_batch_dim": [ 8, 1376 ], "batch_size": 8, "flops": 27466928397056 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 720 ], "batch_size": 8, "flops": 14372230007424 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 672 ], "batch_size": 8, "flops": 13414081344768 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 560 ], "batch_size": 8, "flops": 11178401131904 }, { "type": "perplexity", "in_batch_dim": [ 8, 640 ], "batch_size": 8, "flops": 12775315569664 }, { "type": "perplexity", "in_batch_dim": [ 8, 656 ], "batch_size": 8, "flops": 13094698457216 }, { "type": "perplexity", "in_batch_dim": [ 2, 384 ], "batch_size": 8, "flops": 7665189368832 } ], "timestamp": "2025-09-04 04:20:58.637186", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 280.9776699653499, "timestamp": "2025-09-04 04:20:58.649417", "step": 3900, "epoch": 3 }, { "type": "best_pplx", "content": 255.24736725474492, "timestamp": "2025-09-04 04:20:58.657551", "step": 3900, "epoch": 3 }, { "type": "best_step", "content": 3640, "timestamp": "2025-09-04 04:20:58.671847", "step": 3900, "epoch": 3 }, { "type": "total_pplx_flops", "content": 224996302651284480, "timestamp": "2025-09-04 04:20:58.684121", "step": 3900, "epoch": 3 }, { "type": "total_train_flops", "content": 5.201887604744794e+16, "timestamp": "2025-09-04 04:20:59.236260", "step": 3900, "epoch": 3 } ], "best_evals": { "pplx": { "score": 255.24736725474492, "step": 3640 }, "rouge1": { "precision": 0.8382830502830503, "recall": 0.827133089133089, "fmeasure": 0.8242266793036024 } } }